mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 08:58:48 +00:00
core[minor]: Create BaseMedia object (#23639)
This PR implements a base content object (`BaseMedia`) from which Document and Blob objects will inherit, as proposed here: https://github.com/langchain-ai/langchain/pull/23544 Alternative: create a base object that only has an identifier and no metadata. For now we decided against it, since that refactor can be done at a later time. It also feels a bit odd since our IDs are optional at the moment. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
04bc5f1a95
commit
e800f6bb57
@ -10,7 +10,7 @@ from langchain_core.runnables import run_in_executor
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from langchain_text_splitters import TextSplitter
|
from langchain_text_splitters import TextSplitter
|
||||||
|
|
||||||
from langchain_core.document_loaders.blob_loaders import Blob
|
from langchain_core.documents.base import Blob
|
||||||
|
|
||||||
|
|
||||||
class BaseLoader(ABC):
|
class BaseLoader(ABC):
|
||||||
|
@ -6,175 +6,12 @@ In addition, content loading code should provide a lazy loading interface by def
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import contextlib
|
|
||||||
import mimetypes
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from io import BufferedReader, BytesIO
|
from typing import Iterable
|
||||||
from pathlib import PurePath
|
|
||||||
from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast
|
|
||||||
|
|
||||||
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator
|
# Re-export Blob and PathLike for backwards compatibility
|
||||||
|
from langchain_core.documents.base import Blob as Blob
|
||||||
PathLike = Union[str, PurePath]
|
from langchain_core.documents.base import PathLike as PathLike
|
||||||
|
|
||||||
|
|
||||||
class Blob(BaseModel):
|
|
||||||
"""Blob represents raw data by either reference or value.
|
|
||||||
|
|
||||||
Provides an interface to materialize the blob in different representations, and
|
|
||||||
help to decouple the development of data loaders from the downstream parsing of
|
|
||||||
the raw data.
|
|
||||||
|
|
||||||
Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
|
|
||||||
"""
|
|
||||||
|
|
||||||
data: Union[bytes, str, None]
|
|
||||||
"""Raw data associated with the blob."""
|
|
||||||
mimetype: Optional[str] = None
|
|
||||||
"""MimeType not to be confused with a file extension."""
|
|
||||||
encoding: str = "utf-8"
|
|
||||||
"""Encoding to use if decoding the bytes into a string.
|
|
||||||
|
|
||||||
Use utf-8 as default encoding, if decoding to string.
|
|
||||||
"""
|
|
||||||
path: Optional[PathLike] = None
|
|
||||||
"""Location where the original content was found."""
|
|
||||||
|
|
||||||
metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
||||||
"""Metadata about the blob (e.g., source)"""
|
|
||||||
|
|
||||||
class Config:
|
|
||||||
arbitrary_types_allowed = True
|
|
||||||
frozen = True
|
|
||||||
|
|
||||||
@property
|
|
||||||
def source(self) -> Optional[str]:
|
|
||||||
"""The source location of the blob as string if known otherwise none.
|
|
||||||
|
|
||||||
If a path is associated with the blob, it will default to the path location.
|
|
||||||
|
|
||||||
Unless explicitly set via a metadata field called "source", in which
|
|
||||||
case that value will be used instead.
|
|
||||||
"""
|
|
||||||
if self.metadata and "source" in self.metadata:
|
|
||||||
return cast(Optional[str], self.metadata["source"])
|
|
||||||
return str(self.path) if self.path else None
|
|
||||||
|
|
||||||
@root_validator(pre=True)
|
|
||||||
def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
|
|
||||||
"""Verify that either data or path is provided."""
|
|
||||||
if "data" not in values and "path" not in values:
|
|
||||||
raise ValueError("Either data or path must be provided")
|
|
||||||
return values
|
|
||||||
|
|
||||||
def as_string(self) -> str:
|
|
||||||
"""Read data as a string."""
|
|
||||||
if self.data is None and self.path:
|
|
||||||
with open(str(self.path), "r", encoding=self.encoding) as f:
|
|
||||||
return f.read()
|
|
||||||
elif isinstance(self.data, bytes):
|
|
||||||
return self.data.decode(self.encoding)
|
|
||||||
elif isinstance(self.data, str):
|
|
||||||
return self.data
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unable to get string for blob {self}")
|
|
||||||
|
|
||||||
def as_bytes(self) -> bytes:
|
|
||||||
"""Read data as bytes."""
|
|
||||||
if isinstance(self.data, bytes):
|
|
||||||
return self.data
|
|
||||||
elif isinstance(self.data, str):
|
|
||||||
return self.data.encode(self.encoding)
|
|
||||||
elif self.data is None and self.path:
|
|
||||||
with open(str(self.path), "rb") as f:
|
|
||||||
return f.read()
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unable to get bytes for blob {self}")
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
|
||||||
def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
|
|
||||||
"""Read data as a byte stream."""
|
|
||||||
if isinstance(self.data, bytes):
|
|
||||||
yield BytesIO(self.data)
|
|
||||||
elif self.data is None and self.path:
|
|
||||||
with open(str(self.path), "rb") as f:
|
|
||||||
yield f
|
|
||||||
else:
|
|
||||||
raise NotImplementedError(f"Unable to convert blob {self}")
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_path(
|
|
||||||
cls,
|
|
||||||
path: PathLike,
|
|
||||||
*,
|
|
||||||
encoding: str = "utf-8",
|
|
||||||
mime_type: Optional[str] = None,
|
|
||||||
guess_type: bool = True,
|
|
||||||
metadata: Optional[dict] = None,
|
|
||||||
) -> Blob:
|
|
||||||
"""Load the blob from a path like object.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
path: path like object to file to be read
|
|
||||||
encoding: Encoding to use if decoding the bytes into a string
|
|
||||||
mime_type: if provided, will be set as the mime-type of the data
|
|
||||||
guess_type: If True, the mimetype will be guessed from the file extension,
|
|
||||||
if a mime-type was not provided
|
|
||||||
metadata: Metadata to associate with the blob
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Blob instance
|
|
||||||
"""
|
|
||||||
if mime_type is None and guess_type:
|
|
||||||
_mimetype = mimetypes.guess_type(path)[0] if guess_type else None
|
|
||||||
else:
|
|
||||||
_mimetype = mime_type
|
|
||||||
# We do not load the data immediately, instead we treat the blob as a
|
|
||||||
# reference to the underlying data.
|
|
||||||
return cls(
|
|
||||||
data=None,
|
|
||||||
mimetype=_mimetype,
|
|
||||||
encoding=encoding,
|
|
||||||
path=path,
|
|
||||||
metadata=metadata if metadata is not None else {},
|
|
||||||
)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_data(
|
|
||||||
cls,
|
|
||||||
data: Union[str, bytes],
|
|
||||||
*,
|
|
||||||
encoding: str = "utf-8",
|
|
||||||
mime_type: Optional[str] = None,
|
|
||||||
path: Optional[str] = None,
|
|
||||||
metadata: Optional[dict] = None,
|
|
||||||
) -> Blob:
|
|
||||||
"""Initialize the blob from in-memory data.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
data: the in-memory data associated with the blob
|
|
||||||
encoding: Encoding to use if decoding the bytes into a string
|
|
||||||
mime_type: if provided, will be set as the mime-type of the data
|
|
||||||
path: if provided, will be set as the source from which the data came
|
|
||||||
metadata: Metadata to associate with the blob
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Blob instance
|
|
||||||
"""
|
|
||||||
return cls(
|
|
||||||
data=data,
|
|
||||||
mimetype=mime_type,
|
|
||||||
encoding=encoding,
|
|
||||||
path=path,
|
|
||||||
metadata=metadata if metadata is not None else {},
|
|
||||||
)
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
"""Define the blob representation."""
|
|
||||||
str_repr = f"Blob {id(self)}"
|
|
||||||
if self.source:
|
|
||||||
str_repr += f" {self.source}"
|
|
||||||
return str_repr
|
|
||||||
|
|
||||||
|
|
||||||
class BlobLoader(ABC):
|
class BlobLoader(ABC):
|
||||||
@ -193,3 +30,7 @@ class BlobLoader(ABC):
|
|||||||
Returns:
|
Returns:
|
||||||
A generator over blobs
|
A generator over blobs
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Re-export Blob and PathLike for backwards compatibility
|
||||||
|
__all__ = ["Blob", "BlobLoader", "PathLike"]
|
||||||
|
@ -1,12 +1,250 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Any, List, Literal, Optional
|
import contextlib
|
||||||
|
import mimetypes
|
||||||
|
from io import BufferedReader, BytesIO
|
||||||
|
from pathlib import PurePath
|
||||||
|
from typing import Any, Generator, List, Literal, Mapping, Optional, Union, cast
|
||||||
|
|
||||||
from langchain_core.load.serializable import Serializable
|
from langchain_core.load.serializable import Serializable
|
||||||
from langchain_core.pydantic_v1 import Field
|
from langchain_core.pydantic_v1 import Field, root_validator
|
||||||
|
|
||||||
|
PathLike = Union[str, PurePath]
|
||||||
|
|
||||||
|
|
||||||
class Document(Serializable):
|
class BaseMedia(Serializable):
|
||||||
|
"""Use to represent media content.
|
||||||
|
|
||||||
|
Media objects can be used to represent raw data, such as text or binary data.
|
||||||
|
|
||||||
|
LangChain Media objects allow associating metadata and an optional identifier
|
||||||
|
with the content.
|
||||||
|
|
||||||
|
The presence of an ID and metadata make it easier to store, index, and search
|
||||||
|
over the content in a structured way.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# The ID field is optional at the moment.
|
||||||
|
# It will likely become required in a future major release after
|
||||||
|
# it has been adopted by enough vectorstore implementations.
|
||||||
|
id: Optional[str] = None
|
||||||
|
"""An optional identifier for the document.
|
||||||
|
|
||||||
|
Ideally this should be unique across the document collection and formatted
|
||||||
|
as a UUID, but this will not be enforced.
|
||||||
|
|
||||||
|
.. versionadded:: 0.2.11
|
||||||
|
"""
|
||||||
|
|
||||||
|
metadata: dict = Field(default_factory=dict)
|
||||||
|
"""Arbitrary metadata associated with the content."""
|
||||||
|
|
||||||
|
|
||||||
|
class Blob(BaseMedia):
|
||||||
|
"""Blob represents raw data by either reference or value.
|
||||||
|
|
||||||
|
Provides an interface to materialize the blob in different representations, and
|
||||||
|
help to decouple the development of data loaders from the downstream parsing of
|
||||||
|
the raw data.
|
||||||
|
|
||||||
|
Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
|
||||||
|
|
||||||
|
Example: Initialize a blob from in-memory data
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from langchain_core.documents import Blob
|
||||||
|
|
||||||
|
blob = Blob.from_data("Hello, world!")
|
||||||
|
|
||||||
|
# Read the blob as a string
|
||||||
|
print(blob.as_string())
|
||||||
|
|
||||||
|
# Read the blob as bytes
|
||||||
|
print(blob.as_bytes())
|
||||||
|
|
||||||
|
# Read the blob as a byte stream
|
||||||
|
with blob.as_bytes_io() as f:
|
||||||
|
print(f.read())
|
||||||
|
|
||||||
|
Example: Load from memory and specify mime-type and metadata
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from langchain_core.documents import Blob
|
||||||
|
|
||||||
|
blob = Blob.from_data(
|
||||||
|
data="Hello, world!",
|
||||||
|
mime_type="text/plain",
|
||||||
|
metadata={"source": "https://example.com"}
|
||||||
|
)
|
||||||
|
|
||||||
|
Example: Load the blob from a file
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from langchain_core.documents import Blob
|
||||||
|
|
||||||
|
blob = Blob.from_path("path/to/file.txt")
|
||||||
|
|
||||||
|
# Read the blob as a string
|
||||||
|
print(blob.as_string())
|
||||||
|
|
||||||
|
# Read the blob as bytes
|
||||||
|
print(blob.as_bytes())
|
||||||
|
|
||||||
|
# Read the blob as a byte stream
|
||||||
|
with blob.as_bytes_io() as f:
|
||||||
|
print(f.read())
|
||||||
|
"""
|
||||||
|
|
||||||
|
data: Union[bytes, str, None]
|
||||||
|
"""Raw data associated with the blob."""
|
||||||
|
mimetype: Optional[str] = None
|
||||||
|
"""MimeType not to be confused with a file extension."""
|
||||||
|
encoding: str = "utf-8"
|
||||||
|
"""Encoding to use if decoding the bytes into a string.
|
||||||
|
|
||||||
|
Use utf-8 as default encoding, if decoding to string.
|
||||||
|
"""
|
||||||
|
path: Optional[PathLike] = None
|
||||||
|
"""Location where the original content was found."""
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
arbitrary_types_allowed = True
|
||||||
|
frozen = True
|
||||||
|
|
||||||
|
@property
|
||||||
|
def source(self) -> Optional[str]:
|
||||||
|
"""The source location of the blob as string if known otherwise none.
|
||||||
|
|
||||||
|
If a path is associated with the blob, it will default to the path location.
|
||||||
|
|
||||||
|
Unless explicitly set via a metadata field called "source", in which
|
||||||
|
case that value will be used instead.
|
||||||
|
"""
|
||||||
|
if self.metadata and "source" in self.metadata:
|
||||||
|
return cast(Optional[str], self.metadata["source"])
|
||||||
|
return str(self.path) if self.path else None
|
||||||
|
|
||||||
|
@root_validator(pre=True)
|
||||||
|
def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
|
||||||
|
"""Verify that either data or path is provided."""
|
||||||
|
if "data" not in values and "path" not in values:
|
||||||
|
raise ValueError("Either data or path must be provided")
|
||||||
|
return values
|
||||||
|
|
||||||
|
def as_string(self) -> str:
|
||||||
|
"""Read data as a string."""
|
||||||
|
if self.data is None and self.path:
|
||||||
|
with open(str(self.path), "r", encoding=self.encoding) as f:
|
||||||
|
return f.read()
|
||||||
|
elif isinstance(self.data, bytes):
|
||||||
|
return self.data.decode(self.encoding)
|
||||||
|
elif isinstance(self.data, str):
|
||||||
|
return self.data
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unable to get string for blob {self}")
|
||||||
|
|
||||||
|
def as_bytes(self) -> bytes:
|
||||||
|
"""Read data as bytes."""
|
||||||
|
if isinstance(self.data, bytes):
|
||||||
|
return self.data
|
||||||
|
elif isinstance(self.data, str):
|
||||||
|
return self.data.encode(self.encoding)
|
||||||
|
elif self.data is None and self.path:
|
||||||
|
with open(str(self.path), "rb") as f:
|
||||||
|
return f.read()
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unable to get bytes for blob {self}")
|
||||||
|
|
||||||
|
@contextlib.contextmanager
|
||||||
|
def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
|
||||||
|
"""Read data as a byte stream."""
|
||||||
|
if isinstance(self.data, bytes):
|
||||||
|
yield BytesIO(self.data)
|
||||||
|
elif self.data is None and self.path:
|
||||||
|
with open(str(self.path), "rb") as f:
|
||||||
|
yield f
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f"Unable to convert blob {self}")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_path(
|
||||||
|
cls,
|
||||||
|
path: PathLike,
|
||||||
|
*,
|
||||||
|
encoding: str = "utf-8",
|
||||||
|
mime_type: Optional[str] = None,
|
||||||
|
guess_type: bool = True,
|
||||||
|
metadata: Optional[dict] = None,
|
||||||
|
) -> Blob:
|
||||||
|
"""Load the blob from a path like object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path: path like object to file to be read
|
||||||
|
encoding: Encoding to use if decoding the bytes into a string
|
||||||
|
mime_type: if provided, will be set as the mime-type of the data
|
||||||
|
guess_type: If True, the mimetype will be guessed from the file extension,
|
||||||
|
if a mime-type was not provided
|
||||||
|
metadata: Metadata to associate with the blob
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Blob instance
|
||||||
|
"""
|
||||||
|
if mime_type is None and guess_type:
|
||||||
|
_mimetype = mimetypes.guess_type(path)[0] if guess_type else None
|
||||||
|
else:
|
||||||
|
_mimetype = mime_type
|
||||||
|
# We do not load the data immediately, instead we treat the blob as a
|
||||||
|
# reference to the underlying data.
|
||||||
|
return cls(
|
||||||
|
data=None,
|
||||||
|
mimetype=_mimetype,
|
||||||
|
encoding=encoding,
|
||||||
|
path=path,
|
||||||
|
metadata=metadata if metadata is not None else {},
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_data(
|
||||||
|
cls,
|
||||||
|
data: Union[str, bytes],
|
||||||
|
*,
|
||||||
|
encoding: str = "utf-8",
|
||||||
|
mime_type: Optional[str] = None,
|
||||||
|
path: Optional[str] = None,
|
||||||
|
metadata: Optional[dict] = None,
|
||||||
|
) -> Blob:
|
||||||
|
"""Initialize the blob from in-memory data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data: the in-memory data associated with the blob
|
||||||
|
encoding: Encoding to use if decoding the bytes into a string
|
||||||
|
mime_type: if provided, will be set as the mime-type of the data
|
||||||
|
path: if provided, will be set as the source from which the data came
|
||||||
|
metadata: Metadata to associate with the blob
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Blob instance
|
||||||
|
"""
|
||||||
|
return cls(
|
||||||
|
data=data,
|
||||||
|
mimetype=mime_type,
|
||||||
|
encoding=encoding,
|
||||||
|
path=path,
|
||||||
|
metadata=metadata if metadata is not None else {},
|
||||||
|
)
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
"""Define the blob representation."""
|
||||||
|
str_repr = f"Blob {id(self)}"
|
||||||
|
if self.source:
|
||||||
|
str_repr += f" {self.source}"
|
||||||
|
return str_repr
|
||||||
|
|
||||||
|
|
||||||
|
class Document(BaseMedia):
|
||||||
"""Class for storing a piece of text and associated metadata.
|
"""Class for storing a piece of text and associated metadata.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
@ -21,27 +259,15 @@ class Document(Serializable):
|
|||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# The ID field is optional at the moment.
|
|
||||||
# It will likely become required in a future major release after
|
|
||||||
# it has been adopted by enough vectorstore implementations.
|
|
||||||
id: Optional[str] = None
|
|
||||||
"""An optional identifier for the document.
|
|
||||||
|
|
||||||
Ideally this should be unique across the document collection and formatted
|
|
||||||
as a UUID, but this will not be enforced.
|
|
||||||
"""
|
|
||||||
|
|
||||||
page_content: str
|
page_content: str
|
||||||
"""String text."""
|
"""String text."""
|
||||||
metadata: dict = Field(default_factory=dict)
|
|
||||||
"""Arbitrary metadata about the page content (e.g., source, relationships to other
|
|
||||||
documents, etc.).
|
|
||||||
"""
|
|
||||||
type: Literal["Document"] = "Document"
|
type: Literal["Document"] = "Document"
|
||||||
|
|
||||||
def __init__(self, page_content: str, **kwargs: Any) -> None:
|
def __init__(self, page_content: str, **kwargs: Any) -> None:
|
||||||
"""Pass page_content in as positional or named arg."""
|
"""Pass page_content in as positional or named arg."""
|
||||||
super().__init__(page_content=page_content, **kwargs)
|
# mypy complains that page_content is not defined on the base class.
|
||||||
|
# Here, we're relying on pydantic base class to handle the validation.
|
||||||
|
super().__init__(page_content=page_content, **kwargs) # type: ignore[call-arg]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def is_lc_serializable(cls) -> bool:
|
def is_lc_serializable(cls) -> bool:
|
||||||
|
@ -4,8 +4,8 @@ from typing import Iterator, List
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader
|
from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader
|
||||||
from langchain_core.document_loaders.blob_loaders import Blob
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.documents.base import Blob
|
||||||
|
|
||||||
|
|
||||||
def test_base_blob_parser() -> None:
|
def test_base_blob_parser() -> None:
|
||||||
|
@ -16,5 +16,5 @@ def test_repr() -> None:
|
|||||||
)
|
)
|
||||||
assert (
|
assert (
|
||||||
repr(Document(page_content="Hello, World!", metadata={"a": 3}))
|
repr(Document(page_content="Hello, World!", metadata={"a": 3}))
|
||||||
== "Document(page_content='Hello, World!', metadata={'a': 3})"
|
== "Document(metadata={'a': 3}, page_content='Hello, World!')"
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user