mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 08:58:48 +00:00
core[minor]: Create BaseMedia object (#23639)
This PR implements a BaseMedia object from which Document and Blob objects will inherit proposed here: https://github.com/langchain-ai/langchain/pull/23544 Alternative: Create a base object that only has an identifier and no metadata. For now decided against it, since that refactor can be done at a later time. It also feels a bit odd since our IDs are optional at the moment. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
04bc5f1a95
commit
e800f6bb57
@ -10,7 +10,7 @@ from langchain_core.runnables import run_in_executor
|
||||
if TYPE_CHECKING:
|
||||
from langchain_text_splitters import TextSplitter
|
||||
|
||||
from langchain_core.document_loaders.blob_loaders import Blob
|
||||
from langchain_core.documents.base import Blob
|
||||
|
||||
|
||||
class BaseLoader(ABC):
|
||||
|
@ -6,175 +6,12 @@ In addition, content loading code should provide a lazy loading interface by def
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import mimetypes
|
||||
from abc import ABC, abstractmethod
|
||||
from io import BufferedReader, BytesIO
|
||||
from pathlib import PurePath
|
||||
from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast
|
||||
from typing import Iterable
|
||||
|
||||
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator
|
||||
|
||||
PathLike = Union[str, PurePath]
|
||||
|
||||
|
||||
class Blob(BaseModel):
    """Blob represents raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
    the raw data.

    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
    """

    data: Union[bytes, str, None]
    """Raw data associated with the blob."""
    mimetype: Optional[str] = None
    """MimeType not to be confused with a file extension."""
    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Use utf-8 as default encoding, if decoding to string.
    """
    path: Optional[PathLike] = None
    """Location where the original content was found."""

    metadata: Dict[str, Any] = Field(default_factory=dict)
    """Metadata about the blob (e.g., source)"""

    class Config:
        # Allow PurePath values in the `path` field.
        arbitrary_types_allowed = True
        # Blobs are immutable once constructed.
        frozen = True

    @property
    def source(self) -> Optional[str]:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the blob, it will default to the path location.

        Unless explicitly set via a metadata field called "source", in which
        case that value will be used instead.
        """
        if self.metadata and "source" in self.metadata:
            return cast(Optional[str], self.metadata["source"])
        return str(self.path) if self.path else None

    @root_validator(pre=True)
    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
        """Verify that either data or path is provided."""
        if "data" not in values and "path" not in values:
            raise ValueError("Either data or path must be provided")
        return values

    def as_string(self) -> str:
        """Read data as a string."""
        if self.data is None and self.path:
            # Data was never materialized; read lazily from the referenced path.
            with open(str(self.path), "r", encoding=self.encoding) as f:
                return f.read()
        elif isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes."""
        if isinstance(self.data, bytes):
            return self.data
        elif isinstance(self.data, str):
            return self.data.encode(self.encoding)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                return f.read()
        else:
            raise ValueError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
        """Read data as a byte stream."""
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        guess_type: bool = True,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            guess_type: If True, the mimetype will be guessed from the file extension,
                        if a mime-type was not provided
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        if mime_type is None and guess_type:
            # The guard above already ensures guess_type is True, so the
            # original `... if guess_type else None` re-check was redundant.
            _mimetype = mimetypes.guess_type(path)[0]
        else:
            _mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=_mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        path: Optional[str] = None,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Define the blob representation."""
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
|
||||
# Re-export Blob and PathLike for backwards compatibility
|
||||
from langchain_core.documents.base import Blob as Blob
|
||||
from langchain_core.documents.base import PathLike as PathLike
|
||||
|
||||
|
||||
class BlobLoader(ABC):
|
||||
@ -193,3 +30,7 @@ class BlobLoader(ABC):
|
||||
Returns:
|
||||
A generator over blobs
|
||||
"""
|
||||
|
||||
|
||||
# Re-export Blob and PathLike for backwards compatibility
|
||||
__all__ = ["Blob", "BlobLoader", "PathLike"]
|
||||
|
@ -1,12 +1,250 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, List, Literal, Optional
|
||||
import contextlib
|
||||
import mimetypes
|
||||
from io import BufferedReader, BytesIO
|
||||
from pathlib import PurePath
|
||||
from typing import Any, Generator, List, Literal, Mapping, Optional, Union, cast
|
||||
|
||||
from langchain_core.load.serializable import Serializable
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
from langchain_core.pydantic_v1 import Field, root_validator
|
||||
|
||||
PathLike = Union[str, PurePath]
|
||||
|
||||
|
||||
class Document(Serializable):
|
||||
class BaseMedia(Serializable):
    """Use to represent media content.

    Media objects can be used to represent raw data, such as text or binary data.

    LangChain Media objects allow associating metadata and an optional identifier
    with the content.

    The presence of an ID and metadata make it easier to store, index, and search
    over the content in a structured way.
    """

    # The ID field is optional at the moment.
    # It will likely become required in a future major release after
    # it has been adopted by enough vectorstore implementations.
    id: Optional[str] = None
    """An optional identifier for the document.

    Ideally this should be unique across the document collection and formatted
    as a UUID, but this will not be enforced.

    .. versionadded:: 0.2.11
    """

    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata associated with the content."""
|
||||
|
||||
|
||||
class Blob(BaseMedia):
    """Blob represents raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
    the raw data.

    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob

    Example: Initialize a blob from in-memory data

        .. code-block:: python

            from langchain_core.documents import Blob

            blob = Blob.from_data("Hello, world!")

            # Read the blob as a string
            print(blob.as_string())

            # Read the blob as bytes
            print(blob.as_bytes())

            # Read the blob as a byte stream
            with blob.as_bytes_io() as f:
                print(f.read())

    Example: Load from memory and specify mime-type and metadata

        .. code-block:: python

            from langchain_core.documents import Blob

            blob = Blob.from_data(
                data="Hello, world!",
                mime_type="text/plain",
                metadata={"source": "https://example.com"}
            )

    Example: Load the blob from a file

        .. code-block:: python

            from langchain_core.documents import Blob

            blob = Blob.from_path("path/to/file.txt")

            # Read the blob as a string
            print(blob.as_string())

            # Read the blob as bytes
            print(blob.as_bytes())

            # Read the blob as a byte stream
            with blob.as_bytes_io() as f:
                print(f.read())
    """

    data: Union[bytes, str, None]
    """Raw data associated with the blob."""
    mimetype: Optional[str] = None
    """MimeType not to be confused with a file extension."""
    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Use utf-8 as default encoding, if decoding to string.
    """
    path: Optional[PathLike] = None
    """Location where the original content was found."""

    class Config:
        # Allow PurePath values in the `path` field.
        arbitrary_types_allowed = True
        # Blobs are immutable once constructed.
        frozen = True

    @property
    def source(self) -> Optional[str]:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the blob, it will default to the path location.

        Unless explicitly set via a metadata field called "source", in which
        case that value will be used instead.
        """
        if self.metadata and "source" in self.metadata:
            return cast(Optional[str], self.metadata["source"])
        return str(self.path) if self.path else None

    @root_validator(pre=True)
    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
        """Verify that either data or path is provided."""
        if "data" not in values and "path" not in values:
            raise ValueError("Either data or path must be provided")
        return values

    def as_string(self) -> str:
        """Read data as a string."""
        if self.data is None and self.path:
            # Data was never materialized; read lazily from the referenced path.
            with open(str(self.path), "r", encoding=self.encoding) as f:
                return f.read()
        elif isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes."""
        if isinstance(self.data, bytes):
            return self.data
        elif isinstance(self.data, str):
            return self.data.encode(self.encoding)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                return f.read()
        else:
            raise ValueError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
        """Read data as a byte stream."""
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        guess_type: bool = True,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            guess_type: If True, the mimetype will be guessed from the file extension,
                        if a mime-type was not provided
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        if mime_type is None and guess_type:
            # The guard above already ensures guess_type is True, so the
            # original `... if guess_type else None` re-check was redundant.
            _mimetype = mimetypes.guess_type(path)[0]
        else:
            _mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=_mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        path: Optional[str] = None,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Define the blob representation."""
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
|
||||
|
||||
|
||||
class Document(BaseMedia):
|
||||
"""Class for storing a piece of text and associated metadata.
|
||||
|
||||
Example:
|
||||
@ -21,27 +259,15 @@ class Document(Serializable):
|
||||
)
|
||||
"""
|
||||
|
||||
# The ID field is optional at the moment.
|
||||
# It will likely become required in a future major release after
|
||||
# it has been adopted by enough vectorstore implementations.
|
||||
id: Optional[str] = None
|
||||
"""An optional identifier for the document.
|
||||
|
||||
Ideally this should be unique across the document collection and formatted
|
||||
as a UUID, but this will not be enforced.
|
||||
"""
|
||||
|
||||
page_content: str
|
||||
"""String text."""
|
||||
metadata: dict = Field(default_factory=dict)
|
||||
"""Arbitrary metadata about the page content (e.g., source, relationships to other
|
||||
documents, etc.).
|
||||
"""
|
||||
type: Literal["Document"] = "Document"
|
||||
|
||||
def __init__(self, page_content: str, **kwargs: Any) -> None:
|
||||
"""Pass page_content in as positional or named arg."""
|
||||
super().__init__(page_content=page_content, **kwargs)
|
||||
# my-py is complaining that page_content is not defined on the base class.
|
||||
# Here, we're relying on pydantic base class to handle the validation.
|
||||
super().__init__(page_content=page_content, **kwargs) # type: ignore[call-arg]
|
||||
|
||||
@classmethod
|
||||
def is_lc_serializable(cls) -> bool:
|
||||
|
@ -4,8 +4,8 @@ from typing import Iterator, List
|
||||
import pytest
|
||||
|
||||
from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader
|
||||
from langchain_core.document_loaders.blob_loaders import Blob
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.documents.base import Blob
|
||||
|
||||
|
||||
def test_base_blob_parser() -> None:
|
||||
|
@ -16,5 +16,5 @@ def test_repr() -> None:
|
||||
)
|
||||
assert (
|
||||
repr(Document(page_content="Hello, World!", metadata={"a": 3}))
|
||||
== "Document(page_content='Hello, World!', metadata={'a': 3})"
|
||||
== "Document(metadata={'a': 3}, page_content='Hello, World!')"
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user