mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-23 16:08:10 +00:00
Add metadata to blob (#14162)
Add metadata to the blob object. This makes it easier to build a pipeline that properly propagates metadata from the raw content to the derived content.
This commit is contained in:
parent
66848871fc
commit
a74c03da3c
@ -11,9 +11,9 @@ import mimetypes
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from io import BufferedReader, BytesIO
|
from io import BufferedReader, BytesIO
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from typing import Any, Generator, Iterable, Mapping, Optional, Union
|
from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast
|
||||||
|
|
||||||
from langchain_core.pydantic_v1 import BaseModel, root_validator
|
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator
|
||||||
|
|
||||||
PathLike = Union[str, PurePath]
|
PathLike = Union[str, PurePath]
|
||||||
|
|
||||||
@ -28,14 +28,20 @@ class Blob(BaseModel):
|
|||||||
Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
|
Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
|
||||||
"""
|
"""
|
||||||
|
|
||||||
data: Union[bytes, str, None] # Raw data
|
data: Union[bytes, str, None]
|
||||||
mimetype: Optional[str] = None # Not to be confused with a file extension
|
"""Raw data associated with the blob."""
|
||||||
encoding: str = "utf-8" # Use utf-8 as default encoding, if decoding to string
|
mimetype: Optional[str] = None
|
||||||
# Location where the original content was found
|
"""MimeType not to be confused with a file extension."""
|
||||||
# Represent location on the local file system
|
encoding: str = "utf-8"
|
||||||
# Useful for situations where downstream code assumes it must work with file paths
|
"""Encoding to use if decoding the bytes into a string.
|
||||||
# rather than in-memory content.
|
|
||||||
|
Use utf-8 as default encoding, if decoding to string.
|
||||||
|
"""
|
||||||
path: Optional[PathLike] = None
|
path: Optional[PathLike] = None
|
||||||
|
"""Location where the original content was found."""
|
||||||
|
|
||||||
|
metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||||
|
"""Metadata about the blob (e.g., source)"""
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
@ -43,7 +49,15 @@ class Blob(BaseModel):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def source(self) -> Optional[str]:
|
def source(self) -> Optional[str]:
|
||||||
"""The source location of the blob as string if known otherwise none."""
|
"""The source location of the blob as string if known otherwise none.
|
||||||
|
|
||||||
|
If a path is associated with the blob, it will default to the path location.
|
||||||
|
|
||||||
|
Unless explicitly set via a metadata field called "source", in which
|
||||||
|
case that value will be used instead.
|
||||||
|
"""
|
||||||
|
if self.metadata and "source" in self.metadata:
|
||||||
|
return cast(Optional[str], self.metadata["source"])
|
||||||
return str(self.path) if self.path else None
|
return str(self.path) if self.path else None
|
||||||
|
|
||||||
@root_validator(pre=True)
|
@root_validator(pre=True)
|
||||||
@ -96,6 +110,7 @@ class Blob(BaseModel):
|
|||||||
encoding: str = "utf-8",
|
encoding: str = "utf-8",
|
||||||
mime_type: Optional[str] = None,
|
mime_type: Optional[str] = None,
|
||||||
guess_type: bool = True,
|
guess_type: bool = True,
|
||||||
|
metadata: Optional[dict] = None,
|
||||||
) -> Blob:
|
) -> Blob:
|
||||||
"""Load the blob from a path like object.
|
"""Load the blob from a path like object.
|
||||||
|
|
||||||
@ -105,6 +120,7 @@ class Blob(BaseModel):
|
|||||||
mime_type: if provided, will be set as the mime-type of the data
|
mime_type: if provided, will be set as the mime-type of the data
|
||||||
guess_type: If True, the mimetype will be guessed from the file extension,
|
guess_type: If True, the mimetype will be guessed from the file extension,
|
||||||
if a mime-type was not provided
|
if a mime-type was not provided
|
||||||
|
metadata: Metadata to associate with the blob
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Blob instance
|
Blob instance
|
||||||
@ -115,7 +131,13 @@ class Blob(BaseModel):
|
|||||||
_mimetype = mime_type
|
_mimetype = mime_type
|
||||||
# We do not load the data immediately, instead we treat the blob as a
|
# We do not load the data immediately, instead we treat the blob as a
|
||||||
# reference to the underlying data.
|
# reference to the underlying data.
|
||||||
return cls(data=None, mimetype=_mimetype, encoding=encoding, path=path)
|
return cls(
|
||||||
|
data=None,
|
||||||
|
mimetype=_mimetype,
|
||||||
|
encoding=encoding,
|
||||||
|
path=path,
|
||||||
|
metadata=metadata if metadata is not None else {},
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_data(
|
def from_data(
|
||||||
@ -125,6 +147,7 @@ class Blob(BaseModel):
|
|||||||
encoding: str = "utf-8",
|
encoding: str = "utf-8",
|
||||||
mime_type: Optional[str] = None,
|
mime_type: Optional[str] = None,
|
||||||
path: Optional[str] = None,
|
path: Optional[str] = None,
|
||||||
|
metadata: Optional[dict] = None,
|
||||||
) -> Blob:
|
) -> Blob:
|
||||||
"""Initialize the blob from in-memory data.
|
"""Initialize the blob from in-memory data.
|
||||||
|
|
||||||
@ -133,11 +156,18 @@ class Blob(BaseModel):
|
|||||||
encoding: Encoding to use if decoding the bytes into a string
|
encoding: Encoding to use if decoding the bytes into a string
|
||||||
mime_type: if provided, will be set as the mime-type of the data
|
mime_type: if provided, will be set as the mime-type of the data
|
||||||
path: if provided, will be set as the source from which the data came
|
path: if provided, will be set as the source from which the data came
|
||||||
|
metadata: Metadata to associate with the blob
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Blob instance
|
Blob instance
|
||||||
"""
|
"""
|
||||||
return cls(data=data, mimetype=mime_type, encoding=encoding, path=path)
|
return cls(
|
||||||
|
data=data,
|
||||||
|
mimetype=mime_type,
|
||||||
|
encoding=encoding,
|
||||||
|
path=path,
|
||||||
|
metadata=metadata if metadata is not None else {},
|
||||||
|
)
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
"""Define the blob representation."""
|
"""Define the blob representation."""
|
||||||
|
@ -131,3 +131,20 @@ def test_blob_loader() -> None:
|
|||||||
yield Blob(data=b"Hello, World!")
|
yield Blob(data=b"Hello, World!")
|
||||||
|
|
||||||
assert list(TestLoader().yield_blobs()) == [Blob(data=b"Hello, World!")]
|
assert list(TestLoader().yield_blobs()) == [Blob(data=b"Hello, World!")]
|
||||||
|
|
||||||
|
|
||||||
|
def test_metadata_and_source() -> None:
|
||||||
|
"""Test metadata and source"""
|
||||||
|
blob = Blob(path="some_file", data="b")
|
||||||
|
assert blob.source == "some_file"
|
||||||
|
assert blob.metadata == {}
|
||||||
|
blob = Blob(data=b"", metadata={"source": "hello"})
|
||||||
|
assert blob.source == "hello"
|
||||||
|
assert blob.metadata == {"source": "hello"}
|
||||||
|
|
||||||
|
blob = Blob.from_data("data", metadata={"source": "somewhere"})
|
||||||
|
assert blob.source == "somewhere"
|
||||||
|
|
||||||
|
with get_temp_file(b"hello") as path:
|
||||||
|
blob = Blob.from_path(path, metadata={"source": "somewhere"})
|
||||||
|
assert blob.source == "somewhere"
|
||||||
|
Loading…
Reference in New Issue
Block a user