Add metadata to blob (#14162)

Add metadata to the blob object. This makes it easier
to build a pipeline that properly propagates metadata
from the raw content to the derived content.
This commit is contained in:
Eugene Yurtsev 2023-12-05 17:17:41 -05:00 committed by GitHub
parent 66848871fc
commit a74c03da3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 59 additions and 12 deletions

View File

@ -11,9 +11,9 @@ import mimetypes
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from io import BufferedReader, BytesIO from io import BufferedReader, BytesIO
from pathlib import PurePath from pathlib import PurePath
from typing import Any, Generator, Iterable, Mapping, Optional, Union from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast
from langchain_core.pydantic_v1 import BaseModel, root_validator from langchain_core.pydantic_v1 import BaseModel, Field, root_validator
PathLike = Union[str, PurePath] PathLike = Union[str, PurePath]
@ -28,14 +28,20 @@ class Blob(BaseModel):
Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
""" """
data: Union[bytes, str, None] # Raw data data: Union[bytes, str, None]
mimetype: Optional[str] = None # Not to be confused with a file extension """Raw data associated with the blob."""
encoding: str = "utf-8" # Use utf-8 as default encoding, if decoding to string mimetype: Optional[str] = None
# Location where the original content was found """MimeType not to be confused with a file extension."""
# Represent location on the local file system encoding: str = "utf-8"
# Useful for situations where downstream code assumes it must work with file paths """Encoding to use if decoding the bytes into a string.
# rather than in-memory content.
Use utf-8 as default encoding, if decoding to string.
"""
path: Optional[PathLike] = None path: Optional[PathLike] = None
"""Location where the original content was found."""
metadata: Dict[str, Any] = Field(default_factory=dict)
"""Metadata about the blob (e.g., source)"""
class Config: class Config:
arbitrary_types_allowed = True arbitrary_types_allowed = True
@ -43,7 +49,15 @@ class Blob(BaseModel):
@property @property
def source(self) -> Optional[str]: def source(self) -> Optional[str]:
"""The source location of the blob as string if known otherwise none.""" """The source location of the blob as string if known otherwise none.
If a path is associated with the blob, it will default to the path location.
Unless explicitly set via a metadata field called "source", in which
case that value will be used instead.
"""
if self.metadata and "source" in self.metadata:
return cast(Optional[str], self.metadata["source"])
return str(self.path) if self.path else None return str(self.path) if self.path else None
@root_validator(pre=True) @root_validator(pre=True)
@ -96,6 +110,7 @@ class Blob(BaseModel):
encoding: str = "utf-8", encoding: str = "utf-8",
mime_type: Optional[str] = None, mime_type: Optional[str] = None,
guess_type: bool = True, guess_type: bool = True,
metadata: Optional[dict] = None,
) -> Blob: ) -> Blob:
"""Load the blob from a path like object. """Load the blob from a path like object.
@ -105,6 +120,7 @@ class Blob(BaseModel):
mime_type: if provided, will be set as the mime-type of the data mime_type: if provided, will be set as the mime-type of the data
guess_type: If True, the mimetype will be guessed from the file extension, guess_type: If True, the mimetype will be guessed from the file extension,
if a mime-type was not provided if a mime-type was not provided
metadata: Metadata to associate with the blob
Returns: Returns:
Blob instance Blob instance
@ -115,7 +131,13 @@ class Blob(BaseModel):
_mimetype = mime_type _mimetype = mime_type
# We do not load the data immediately, instead we treat the blob as a # We do not load the data immediately, instead we treat the blob as a
# reference to the underlying data. # reference to the underlying data.
return cls(data=None, mimetype=_mimetype, encoding=encoding, path=path) return cls(
data=None,
mimetype=_mimetype,
encoding=encoding,
path=path,
metadata=metadata if metadata is not None else {},
)
@classmethod @classmethod
def from_data( def from_data(
@ -125,6 +147,7 @@ class Blob(BaseModel):
encoding: str = "utf-8", encoding: str = "utf-8",
mime_type: Optional[str] = None, mime_type: Optional[str] = None,
path: Optional[str] = None, path: Optional[str] = None,
metadata: Optional[dict] = None,
) -> Blob: ) -> Blob:
"""Initialize the blob from in-memory data. """Initialize the blob from in-memory data.
@ -133,11 +156,18 @@ class Blob(BaseModel):
encoding: Encoding to use if decoding the bytes into a string encoding: Encoding to use if decoding the bytes into a string
mime_type: if provided, will be set as the mime-type of the data mime_type: if provided, will be set as the mime-type of the data
path: if provided, will be set as the source from which the data came path: if provided, will be set as the source from which the data came
metadata: Metadata to associate with the blob
Returns: Returns:
Blob instance Blob instance
""" """
return cls(data=data, mimetype=mime_type, encoding=encoding, path=path) return cls(
data=data,
mimetype=mime_type,
encoding=encoding,
path=path,
metadata=metadata if metadata is not None else {},
)
def __repr__(self) -> str: def __repr__(self) -> str:
"""Define the blob representation.""" """Define the blob representation."""

View File

@ -131,3 +131,20 @@ def test_blob_loader() -> None:
yield Blob(data=b"Hello, World!") yield Blob(data=b"Hello, World!")
assert list(TestLoader().yield_blobs()) == [Blob(data=b"Hello, World!")] assert list(TestLoader().yield_blobs()) == [Blob(data=b"Hello, World!")]
def test_metadata_and_source() -> None:
    """Check how a blob's ``source`` relates to its ``path`` and ``metadata``."""
    # Only a path is given: the source is the path and metadata is empty.
    path_only = Blob(path="some_file", data="b")
    assert path_only.source == "some_file"
    assert path_only.metadata == {}

    # A "source" entry in the metadata is reported as the blob's source.
    with_metadata = Blob(data=b"", metadata={"source": "hello"})
    assert with_metadata.source == "hello"
    assert with_metadata.metadata == {"source": "hello"}

    # The alternate constructors forward metadata to the blob as well.
    from_memory = Blob.from_data("data", metadata={"source": "somewhere"})
    assert from_memory.source == "somewhere"

    with get_temp_file(b"hello") as path:
        # A path is present here too, yet the metadata "source" is what is returned.
        from_disk = Blob.from_path(path, metadata={"source": "somewhere"})
        assert from_disk.source == "somewhere"