mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 08:58:48 +00:00
core[minor]: Create BaseMedia object (#23639)
This PR implements a BaseMedia object from which Document and Blob objects will inherit proposed here: https://github.com/langchain-ai/langchain/pull/23544 Alternative: Create a base object that only has an identifier and no metadata. For now decided against it, since that refactor can be done at a later time. It also feels a bit odd since our IDs are optional at the moment. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
04bc5f1a95
commit
e800f6bb57
@ -10,7 +10,7 @@ from langchain_core.runnables import run_in_executor
|
||||
if TYPE_CHECKING:
|
||||
from langchain_text_splitters import TextSplitter
|
||||
|
||||
from langchain_core.document_loaders.blob_loaders import Blob
|
||||
from langchain_core.documents.base import Blob
|
||||
|
||||
|
||||
class BaseLoader(ABC):
|
||||
|
@ -6,175 +6,12 @@ In addition, content loading code should provide a lazy loading interface by def
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import mimetypes
|
||||
from abc import ABC, abstractmethod
|
||||
from io import BufferedReader, BytesIO
|
||||
from pathlib import PurePath
|
||||
from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast
|
||||
from typing import Iterable
|
||||
|
||||
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator
|
||||
|
||||
PathLike = Union[str, PurePath]
|
||||
|
||||
|
||||
class Blob(BaseModel):
    """Blob represents raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
    the raw data.

    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
    """

    data: Union[bytes, str, None]
    """Raw data associated with the blob."""
    mimetype: Optional[str] = None
    """MimeType not to be confused with a file extension."""
    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Use utf-8 as default encoding, if decoding to string.
    """
    path: Optional[PathLike] = None
    """Location where the original content was found."""

    metadata: Dict[str, Any] = Field(default_factory=dict)
    """Metadata about the blob (e.g., source)"""

    class Config:
        # Allow PurePath values in the `path` field.
        arbitrary_types_allowed = True
        # Blobs are immutable once constructed.
        frozen = True

    @property
    def source(self) -> Optional[str]:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the blob, it will default to the path location.

        Unless explicitly set via a metadata field called "source", in which
        case that value will be used instead.
        """
        if self.metadata and "source" in self.metadata:
            return cast(Optional[str], self.metadata["source"])
        return str(self.path) if self.path else None

    @root_validator(pre=True)
    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
        """Verify that either data or path is provided."""
        if "data" not in values and "path" not in values:
            raise ValueError("Either data or path must be provided")
        return values

    def as_string(self) -> str:
        """Read data as a string."""
        if self.data is None and self.path:
            # Data was never materialized; read lazily from the referenced path.
            with open(str(self.path), "r", encoding=self.encoding) as f:
                return f.read()
        elif isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes."""
        if isinstance(self.data, bytes):
            return self.data
        elif isinstance(self.data, str):
            return self.data.encode(self.encoding)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                return f.read()
        else:
            raise ValueError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
        """Read data as a byte stream."""
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        guess_type: bool = True,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            guess_type: If True, the mimetype will be guessed from the file extension,
                        if a mime-type was not provided
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        if mime_type is None and guess_type:
            # The guard above already ensures guess_type is True, so the
            # original `... if guess_type else None` re-check was redundant.
            _mimetype = mimetypes.guess_type(path)[0]
        else:
            _mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=_mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        path: Optional[str] = None,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Define the blob representation."""
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
|
||||
# Re-export Blob and PathLike for backwards compatibility
|
||||
from langchain_core.documents.base import Blob as Blob
|
||||
from langchain_core.documents.base import PathLike as PathLike
|
||||
|
||||
|
||||
class BlobLoader(ABC):
|
||||
@ -193,3 +30,7 @@ class BlobLoader(ABC):
|
||||
Returns:
|
||||
A generator over blobs
|
||||
"""
|
||||
|
||||
|
||||
# Re-export Blob and PathLike for backwards compatibility
|
||||
__all__ = ["Blob", "BlobLoader", "PathLike"]
|
||||
|
@ -1,12 +1,250 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, List, Literal, Optional
|
||||
import contextlib
|
||||
import mimetypes
|
||||
from io import BufferedReader, BytesIO
|
||||
from pathlib import PurePath
|
||||
from typing import Any, Generator, List, Literal, Mapping, Optional, Union, cast
|
||||
|
||||
from langchain_core.load.serializable import Serializable
|
||||
from langchain_core.pydantic_v1 import Field
|
||||
from langchain_core.pydantic_v1 import Field, root_validator
|
||||
|
||||
PathLike = Union[str, PurePath]
|
||||
|
||||
|
||||
class Document(Serializable):
|
||||
class BaseMedia(Serializable):
    """Use to represent media content.

    Media objects can be used to represent raw data, such as text or binary data.

    LangChain Media objects allow associating metadata and an optional identifier
    with the content.

    The presence of an ID and metadata make it easier to store, index, and search
    over the content in a structured way.
    """

    # The ID field is optional at the moment.
    # It will likely become required in a future major release after
    # it has been adopted by enough vectorstore implementations.
    id: Optional[str] = None
    """An optional identifier for the document.

    Ideally this should be unique across the document collection and formatted
    as a UUID, but this will not be enforced.

    .. versionadded:: 0.2.11
    """

    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata associated with the content."""
|
||||
|
||||
|
||||
class Blob(BaseMedia):
    """Blob represents raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
    the raw data.

    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob

    Example: Initialize a blob from in-memory data

        .. code-block:: python

            from langchain_core.documents import Blob

            blob = Blob.from_data("Hello, world!")

            # Read the blob as a string
            print(blob.as_string())

            # Read the blob as bytes
            print(blob.as_bytes())

            # Read the blob as a byte stream
            with blob.as_bytes_io() as f:
                print(f.read())

    Example: Load from memory and specify mime-type and metadata

        .. code-block:: python

            from langchain_core.documents import Blob

            blob = Blob.from_data(
                data="Hello, world!",
                mime_type="text/plain",
                metadata={"source": "https://example.com"}
            )

    Example: Load the blob from a file

        .. code-block:: python

            from langchain_core.documents import Blob

            blob = Blob.from_path("path/to/file.txt")

            # Read the blob as a string
            print(blob.as_string())

            # Read the blob as bytes
            print(blob.as_bytes())

            # Read the blob as a byte stream
            with blob.as_bytes_io() as f:
                print(f.read())
    """

    data: Union[bytes, str, None]
    """Raw data associated with the blob."""
    mimetype: Optional[str] = None
    """MimeType not to be confused with a file extension."""
    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Use utf-8 as default encoding, if decoding to string.
    """
    path: Optional[PathLike] = None
    """Location where the original content was found."""

    class Config:
        # Allow PurePath values in the `path` field.
        arbitrary_types_allowed = True
        # Blobs are immutable once constructed.
        frozen = True

    @property
    def source(self) -> Optional[str]:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the blob, it will default to the path location.

        Unless explicitly set via a metadata field called "source", in which
        case that value will be used instead.
        """
        if self.metadata and "source" in self.metadata:
            return cast(Optional[str], self.metadata["source"])
        return str(self.path) if self.path else None

    @root_validator(pre=True)
    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
        """Verify that either data or path is provided."""
        if "data" not in values and "path" not in values:
            raise ValueError("Either data or path must be provided")
        return values

    def as_string(self) -> str:
        """Read data as a string."""
        if self.data is None and self.path:
            # Data was never materialized; read lazily from the referenced path.
            with open(str(self.path), "r", encoding=self.encoding) as f:
                return f.read()
        elif isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes."""
        if isinstance(self.data, bytes):
            return self.data
        elif isinstance(self.data, str):
            return self.data.encode(self.encoding)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                return f.read()
        else:
            raise ValueError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
        """Read data as a byte stream."""
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        guess_type: bool = True,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            guess_type: If True, the mimetype will be guessed from the file extension,
                        if a mime-type was not provided
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        if mime_type is None and guess_type:
            # The guard above already ensures guess_type is True, so the
            # original `... if guess_type else None` re-check was redundant.
            _mimetype = mimetypes.guess_type(path)[0]
        else:
            _mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=_mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        path: Optional[str] = None,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Define the blob representation."""
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
|
||||
|
||||
|
||||
class Document(BaseMedia):
|
||||
"""Class for storing a piece of text and associated metadata.
|
||||
|
||||
Example:
|
||||
@ -21,27 +259,15 @@ class Document(Serializable):
|
||||
)
|
||||
"""
|
||||
|
||||
# The ID field is optional at the moment.
|
||||
# It will likely become required in a future major release after
|
||||
# it has been adopted by enough vectorstore implementations.
|
||||
id: Optional[str] = None
|
||||
"""An optional identifier for the document.
|
||||
|
||||
Ideally this should be unique across the document collection and formatted
|
||||
as a UUID, but this will not be enforced.
|
||||
"""
|
||||
|
||||
page_content: str
|
||||
"""String text."""
|
||||
metadata: dict = Field(default_factory=dict)
|
||||
"""Arbitrary metadata about the page content (e.g., source, relationships to other
|
||||
documents, etc.).
|
||||
"""
|
||||
type: Literal["Document"] = "Document"
|
||||
|
||||
def __init__(self, page_content: str, **kwargs: Any) -> None:
|
||||
"""Pass page_content in as positional or named arg."""
|
||||
super().__init__(page_content=page_content, **kwargs)
|
||||
# my-py is complaining that page_content is not defined on the base class.
|
||||
# Here, we're relying on pydantic base class to handle the validation.
|
||||
super().__init__(page_content=page_content, **kwargs) # type: ignore[call-arg]
|
||||
|
||||
@classmethod
|
||||
def is_lc_serializable(cls) -> bool:
|
||||
|
@ -4,8 +4,8 @@ from typing import Iterator, List
|
||||
import pytest
|
||||
|
||||
from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader
|
||||
from langchain_core.document_loaders.blob_loaders import Blob
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.documents.base import Blob
|
||||
|
||||
|
||||
def test_base_blob_parser() -> None:
|
||||
|
@ -16,5 +16,5 @@ def test_repr() -> None:
|
||||
)
|
||||
assert (
|
||||
repr(Document(page_content="Hello, World!", metadata={"a": 3}))
|
||||
== "Document(page_content='Hello, World!', metadata={'a': 3})"
|
||||
== "Document(metadata={'a': 3}, page_content='Hello, World!')"
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user