From ea141511d896ab52b49cca097f85bb5e1f18846b Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Wed, 6 Mar 2024 19:59:00 +0100 Subject: [PATCH] core: Move document loader interfaces to core (#17723) This is needed to be able to move document loaders to partner packages. --------- Co-authored-by: Eugene Yurtsev --- .../document_loaders/base.py | 123 +---------- .../document_loaders/blob_loaders/schema.py | 200 +----------------- .../document_loaders/__init__.py | 10 + .../langchain_core/document_loaders/base.py | 119 +++++++++++ .../document_loaders/blob_loaders.py | 195 +++++++++++++++++ libs/core/poetry.lock | 21 +- libs/core/pyproject.toml | 1 + 7 files changed, 356 insertions(+), 313 deletions(-) create mode 100644 libs/core/langchain_core/document_loaders/__init__.py create mode 100644 libs/core/langchain_core/document_loaders/base.py create mode 100644 libs/core/langchain_core/document_loaders/blob_loaders.py diff --git a/libs/community/langchain_community/document_loaders/base.py b/libs/community/langchain_community/document_loaders/base.py index 8af5e48fe21..a1c3d82524f 100644 --- a/libs/community/langchain_community/document_loaders/base.py +++ b/libs/community/langchain_community/document_loaders/base.py @@ -1,119 +1,6 @@ -"""Abstract interface for document loader implementations.""" -from __future__ import annotations +from langchain_core.document_loaders import BaseBlobParser, BaseLoader -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Optional - -from langchain_core.documents import Document -from langchain_core.runnables import run_in_executor - -if TYPE_CHECKING: - from langchain_text_splitters import TextSplitter - -from langchain_community.document_loaders.blob_loaders import Blob - - -class BaseLoader(ABC): - """Interface for Document Loader. - - Implementations should implement the lazy-loading method using generators - to avoid loading all Documents into memory at once. - - `load` is provided just for user convenience and should not be overridden. - """ - - # Sub-classes should not implement this method directly. Instead, they - # should implement the lazy load method. - def load(self) -> List[Document]: - """Load data into Document objects.""" - return list(self.lazy_load()) - - def load_and_split( - self, text_splitter: Optional[TextSplitter] = None - ) -> List[Document]: - """Load Documents and split into chunks. Chunks are returned as Documents. - - Do not override this method. It should be considered to be deprecated! - - Args: - text_splitter: TextSplitter instance to use for splitting documents. - Defaults to RecursiveCharacterTextSplitter. - - Returns: - List of Documents. - """ - - if text_splitter is None: - try: - from langchain_text_splitters import RecursiveCharacterTextSplitter - except ImportError as e: - raise ImportError( - "Unable to import from langchain_text_splitters. Please specify " - "text_splitter or install langchain_text_splitters with " - "`pip install -U langchain-text-splitters`." - ) from e - - _text_splitter: TextSplitter = RecursiveCharacterTextSplitter() - else: - _text_splitter = text_splitter - docs = self.load() - return _text_splitter.split_documents(docs) - - # Attention: This method will be upgraded into an abstractmethod once it's - # implemented in all the existing subclasses. 
- def lazy_load(self) -> Iterator[Document]: - """A lazy loader for Documents.""" - raise NotImplementedError( - f"{self.__class__.__name__} does not implement lazy_load()" - ) - - async def alazy_load(self) -> AsyncIterator[Document]: - """A lazy loader for Documents.""" - iterator = await run_in_executor(None, self.lazy_load) - done = object() - while True: - doc = await run_in_executor(None, next, iterator, done) # type: ignore[call-arg, arg-type] - if doc is done: - break - yield doc # type: ignore[misc] - - -class BaseBlobParser(ABC): - """Abstract interface for blob parsers. - - A blob parser provides a way to parse raw data stored in a blob into one - or more documents. - - The parser can be composed with blob loaders, making it easy to reuse - a parser independent of how the blob was originally loaded. - """ - - @abstractmethod - def lazy_parse(self, blob: Blob) -> Iterator[Document]: - """Lazy parsing interface. - - Subclasses are required to implement this method. - - Args: - blob: Blob instance - - Returns: - Generator of documents - """ - - def parse(self, blob: Blob) -> List[Document]: - """Eagerly parse the blob into a document or documents. - - This is a convenience method for interactive development environment. - - Production applications should favor the lazy_parse method instead. - - Subclasses should generally not over-ride this parse method. - - Args: - blob: Blob instance - - Returns: - List of documents - """ - return list(self.lazy_parse(blob)) +__all__ = [ + "BaseBlobParser", + "BaseLoader", +] diff --git a/libs/community/langchain_community/document_loaders/blob_loaders/schema.py b/libs/community/langchain_community/document_loaders/blob_loaders/schema.py index c2f88a14015..208510eaeac 100644 --- a/libs/community/langchain_community/document_loaders/blob_loaders/schema.py +++ b/libs/community/langchain_community/document_loaders/blob_loaders/schema.py @@ -1,195 +1,7 @@ -"""Schema for Blobs and Blob Loaders. +from langchain_core.document_loaders.blob_loaders import Blob, BlobLoader, PathLike -The goal is to facilitate decoupling of content loading from content parsing code. - -In addition, content loading code should provide a lazy loading interface by default. -""" -from __future__ import annotations - -import contextlib -import mimetypes -from abc import ABC, abstractmethod -from io import BufferedReader, BytesIO -from pathlib import PurePath -from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast - -from langchain_core.pydantic_v1 import BaseModel, Field, root_validator - -PathLike = Union[str, PurePath] - - -class Blob(BaseModel): - """Blob represents raw data by either reference or value. - - Provides an interface to materialize the blob in different representations, and - help to decouple the development of data loaders from the downstream parsing of - the raw data. - - Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob - """ - - data: Union[bytes, str, None] - """Raw data associated with the blob.""" - mimetype: Optional[str] = None - """MimeType not to be confused with a file extension.""" - encoding: str = "utf-8" - """Encoding to use if decoding the bytes into a string. - - Use utf-8 as default encoding, if decoding to string. 
- """ - path: Optional[PathLike] = None - """Location where the original content was found.""" - - metadata: Dict[str, Any] = Field(default_factory=dict) - """Metadata about the blob (e.g., source)""" - - class Config: - arbitrary_types_allowed = True - frozen = True - - @property - def source(self) -> Optional[str]: - """The source location of the blob as string if known otherwise none. - - If a path is associated with the blob, it will default to the path location. - - Unless explicitly set via a metadata field called "source", in which - case that value will be used instead. - """ - if self.metadata and "source" in self.metadata: - return cast(Optional[str], self.metadata["source"]) - return str(self.path) if self.path else None - - @root_validator(pre=True) - def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]: - """Verify that either data or path is provided.""" - if "data" not in values and "path" not in values: - raise ValueError("Either data or path must be provided") - return values - - def as_string(self) -> str: - """Read data as a string.""" - if self.data is None and self.path: - with open(str(self.path), "r", encoding=self.encoding) as f: - return f.read() - elif isinstance(self.data, bytes): - return self.data.decode(self.encoding) - elif isinstance(self.data, str): - return self.data - else: - raise ValueError(f"Unable to get string for blob {self}") - - def as_bytes(self) -> bytes: - """Read data as bytes.""" - if isinstance(self.data, bytes): - return self.data - elif isinstance(self.data, str): - return self.data.encode(self.encoding) - elif self.data is None and self.path: - with open(str(self.path), "rb") as f: - return f.read() - else: - raise ValueError(f"Unable to get bytes for blob {self}") - - @contextlib.contextmanager - def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]: - """Read data as a byte stream.""" - if isinstance(self.data, bytes): - yield BytesIO(self.data) - elif self.data is None and self.path: - with open(str(self.path), "rb") as f: - yield f - else: - raise NotImplementedError(f"Unable to convert blob {self}") - - @classmethod - def from_path( - cls, - path: PathLike, - *, - encoding: str = "utf-8", - mime_type: Optional[str] = None, - guess_type: bool = True, - metadata: Optional[dict] = None, - ) -> Blob: - """Load the blob from a path like object. - - Args: - path: path like object to file to be read - encoding: Encoding to use if decoding the bytes into a string - mime_type: if provided, will be set as the mime-type of the data - guess_type: If True, the mimetype will be guessed from the file extension, - if a mime-type was not provided - metadata: Metadata to associate with the blob - - Returns: - Blob instance - """ - if mime_type is None and guess_type: - _mimetype = mimetypes.guess_type(path)[0] if guess_type else None - else: - _mimetype = mime_type - # We do not load the data immediately, instead we treat the blob as a - # reference to the underlying data. - return cls( - data=None, - mimetype=_mimetype, - encoding=encoding, - path=path, - metadata=metadata if metadata is not None else {}, - ) - - @classmethod - def from_data( - cls, - data: Union[str, bytes], - *, - encoding: str = "utf-8", - mime_type: Optional[str] = None, - path: Optional[str] = None, - metadata: Optional[dict] = None, - ) -> Blob: - """Initialize the blob from in-memory data. 
- - Args: - data: the in-memory data associated with the blob - encoding: Encoding to use if decoding the bytes into a string - mime_type: if provided, will be set as the mime-type of the data - path: if provided, will be set as the source from which the data came - metadata: Metadata to associate with the blob - - Returns: - Blob instance - """ - return cls( - data=data, - mimetype=mime_type, - encoding=encoding, - path=path, - metadata=metadata if metadata is not None else {}, - ) - - def __repr__(self) -> str: - """Define the blob representation.""" - str_repr = f"Blob {id(self)}" - if self.source: - str_repr += f" {self.source}" - return str_repr - - -class BlobLoader(ABC): - """Abstract interface for blob loaders implementation. - - Implementer should be able to load raw content from a storage system according - to some criteria and return the raw content lazily as a stream of blobs. - """ - - @abstractmethod - def yield_blobs( - self, - ) -> Iterable[Blob]: - """A lazy loader for raw data represented by LangChain's Blob object. - - Returns: - A generator over blobs - """ +__all__ = [ + "Blob", + "BlobLoader", + "PathLike", +] diff --git a/libs/core/langchain_core/document_loaders/__init__.py b/libs/core/langchain_core/document_loaders/__init__.py new file mode 100644 index 00000000000..05a48a9be01 --- /dev/null +++ b/libs/core/langchain_core/document_loaders/__init__.py @@ -0,0 +1,10 @@ +from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader +from langchain_core.document_loaders.blob_loaders import Blob, BlobLoader, PathLike + +__all__ = [ + "BaseBlobParser", + "BaseLoader", + "Blob", + "BlobLoader", + "PathLike", +] diff --git a/libs/core/langchain_core/document_loaders/base.py b/libs/core/langchain_core/document_loaders/base.py new file mode 100644 index 00000000000..187123a3e39 --- /dev/null +++ b/libs/core/langchain_core/document_loaders/base.py @@ -0,0 +1,119 @@ +"""Abstract interface for document loader implementations.""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Optional + +from langchain_core.documents import Document +from langchain_core.runnables import run_in_executor + +if TYPE_CHECKING: + from langchain_text_splitters import TextSplitter + +from langchain_core.document_loaders.blob_loaders import Blob + + +class BaseLoader(ABC): + """Interface for Document Loader. + + Implementations should implement the lazy-loading method using generators + to avoid loading all Documents into memory at once. + + `load` is provided just for user convenience and should not be overridden. + """ + + # Sub-classes should not implement this method directly. Instead, they + # should implement the lazy load method. + def load(self) -> List[Document]: + """Load data into Document objects.""" + return list(self.lazy_load()) + + def load_and_split( + self, text_splitter: Optional[TextSplitter] = None + ) -> List[Document]: + """Load Documents and split into chunks. Chunks are returned as Documents. + + Do not override this method. It should be considered to be deprecated! + + Args: + text_splitter: TextSplitter instance to use for splitting documents. + Defaults to RecursiveCharacterTextSplitter. + + Returns: + List of Documents. + """ + + if text_splitter is None: + try: + from langchain_text_splitters import RecursiveCharacterTextSplitter + except ImportError as e: + raise ImportError( + "Unable to import from langchain_text_splitters. 
Please specify " + "text_splitter or install langchain_text_splitters with " + "`pip install -U langchain-text-splitters`." + ) from e + + _text_splitter: TextSplitter = RecursiveCharacterTextSplitter() + else: + _text_splitter = text_splitter + docs = self.load() + return _text_splitter.split_documents(docs) + + # Attention: This method will be upgraded into an abstractmethod once it's + # implemented in all the existing subclasses. + def lazy_load(self) -> Iterator[Document]: + """A lazy loader for Documents.""" + raise NotImplementedError( + f"{self.__class__.__name__} does not implement lazy_load()" + ) + + async def alazy_load(self) -> AsyncIterator[Document]: + """A lazy loader for Documents.""" + iterator = await run_in_executor(None, self.lazy_load) + done = object() + while True: + doc = await run_in_executor(None, next, iterator, done) # type: ignore[call-arg, arg-type] + if doc is done: + break + yield doc # type: ignore[misc] + + +class BaseBlobParser(ABC): + """Abstract interface for blob parsers. + + A blob parser provides a way to parse raw data stored in a blob into one + or more documents. + + The parser can be composed with blob loaders, making it easy to reuse + a parser independent of how the blob was originally loaded. + """ + + @abstractmethod + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Lazy parsing interface. + + Subclasses are required to implement this method. + + Args: + blob: Blob instance + + Returns: + Generator of documents + """ + + def parse(self, blob: Blob) -> List[Document]: + """Eagerly parse the blob into a document or documents. + + This is a convenience method for interactive development environment. + + Production applications should favor the lazy_parse method instead. + + Subclasses should generally not over-ride this parse method. + + Args: + blob: Blob instance + + Returns: + List of documents + """ + return list(self.lazy_parse(blob)) diff --git a/libs/core/langchain_core/document_loaders/blob_loaders.py b/libs/core/langchain_core/document_loaders/blob_loaders.py new file mode 100644 index 00000000000..c2f88a14015 --- /dev/null +++ b/libs/core/langchain_core/document_loaders/blob_loaders.py @@ -0,0 +1,195 @@ +"""Schema for Blobs and Blob Loaders. + +The goal is to facilitate decoupling of content loading from content parsing code. + +In addition, content loading code should provide a lazy loading interface by default. +""" +from __future__ import annotations + +import contextlib +import mimetypes +from abc import ABC, abstractmethod +from io import BufferedReader, BytesIO +from pathlib import PurePath +from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast + +from langchain_core.pydantic_v1 import BaseModel, Field, root_validator + +PathLike = Union[str, PurePath] + + +class Blob(BaseModel): + """Blob represents raw data by either reference or value. + + Provides an interface to materialize the blob in different representations, and + help to decouple the development of data loaders from the downstream parsing of + the raw data. + + Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob + """ + + data: Union[bytes, str, None] + """Raw data associated with the blob.""" + mimetype: Optional[str] = None + """MimeType not to be confused with a file extension.""" + encoding: str = "utf-8" + """Encoding to use if decoding the bytes into a string. + + Use utf-8 as default encoding, if decoding to string. 
+ """ + path: Optional[PathLike] = None + """Location where the original content was found.""" + + metadata: Dict[str, Any] = Field(default_factory=dict) + """Metadata about the blob (e.g., source)""" + + class Config: + arbitrary_types_allowed = True + frozen = True + + @property + def source(self) -> Optional[str]: + """The source location of the blob as string if known otherwise none. + + If a path is associated with the blob, it will default to the path location. + + Unless explicitly set via a metadata field called "source", in which + case that value will be used instead. + """ + if self.metadata and "source" in self.metadata: + return cast(Optional[str], self.metadata["source"]) + return str(self.path) if self.path else None + + @root_validator(pre=True) + def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]: + """Verify that either data or path is provided.""" + if "data" not in values and "path" not in values: + raise ValueError("Either data or path must be provided") + return values + + def as_string(self) -> str: + """Read data as a string.""" + if self.data is None and self.path: + with open(str(self.path), "r", encoding=self.encoding) as f: + return f.read() + elif isinstance(self.data, bytes): + return self.data.decode(self.encoding) + elif isinstance(self.data, str): + return self.data + else: + raise ValueError(f"Unable to get string for blob {self}") + + def as_bytes(self) -> bytes: + """Read data as bytes.""" + if isinstance(self.data, bytes): + return self.data + elif isinstance(self.data, str): + return self.data.encode(self.encoding) + elif self.data is None and self.path: + with open(str(self.path), "rb") as f: + return f.read() + else: + raise ValueError(f"Unable to get bytes for blob {self}") + + @contextlib.contextmanager + def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]: + """Read data as a byte stream.""" + if isinstance(self.data, bytes): + yield BytesIO(self.data) + elif self.data is None and self.path: + with open(str(self.path), "rb") as f: + yield f + else: + raise NotImplementedError(f"Unable to convert blob {self}") + + @classmethod + def from_path( + cls, + path: PathLike, + *, + encoding: str = "utf-8", + mime_type: Optional[str] = None, + guess_type: bool = True, + metadata: Optional[dict] = None, + ) -> Blob: + """Load the blob from a path like object. + + Args: + path: path like object to file to be read + encoding: Encoding to use if decoding the bytes into a string + mime_type: if provided, will be set as the mime-type of the data + guess_type: If True, the mimetype will be guessed from the file extension, + if a mime-type was not provided + metadata: Metadata to associate with the blob + + Returns: + Blob instance + """ + if mime_type is None and guess_type: + _mimetype = mimetypes.guess_type(path)[0] if guess_type else None + else: + _mimetype = mime_type + # We do not load the data immediately, instead we treat the blob as a + # reference to the underlying data. + return cls( + data=None, + mimetype=_mimetype, + encoding=encoding, + path=path, + metadata=metadata if metadata is not None else {}, + ) + + @classmethod + def from_data( + cls, + data: Union[str, bytes], + *, + encoding: str = "utf-8", + mime_type: Optional[str] = None, + path: Optional[str] = None, + metadata: Optional[dict] = None, + ) -> Blob: + """Initialize the blob from in-memory data. 
+ + Args: + data: the in-memory data associated with the blob + encoding: Encoding to use if decoding the bytes into a string + mime_type: if provided, will be set as the mime-type of the data + path: if provided, will be set as the source from which the data came + metadata: Metadata to associate with the blob + + Returns: + Blob instance + """ + return cls( + data=data, + mimetype=mime_type, + encoding=encoding, + path=path, + metadata=metadata if metadata is not None else {}, + ) + + def __repr__(self) -> str: + """Define the blob representation.""" + str_repr = f"Blob {id(self)}" + if self.source: + str_repr += f" {self.source}" + return str_repr + + +class BlobLoader(ABC): + """Abstract interface for blob loaders implementation. + + Implementer should be able to load raw content from a storage system according + to some criteria and return the raw content lazily as a stream of blobs. + """ + + @abstractmethod + def yield_blobs( + self, + ) -> Iterable[Blob]: + """A lazy loader for raw data represented by LangChain's Blob object. + + Returns: + A generator over blobs + """ diff --git a/libs/core/poetry.lock b/libs/core/poetry.lock index 1cf584cae40..21f02abf9e4 100644 --- a/libs/core/poetry.lock +++ b/libs/core/poetry.lock @@ -1133,6 +1133,25 @@ files = [ {file = "jupyterlab_widgets-3.0.9.tar.gz", hash = "sha256:6005a4e974c7beee84060fdfba341a3218495046de8ae3ec64888e5fe19fdb4c"}, ] +[[package]] +name = "langchain-text-splitters" +version = "0.0.1" +description = "LangChain text splitting utilities" +optional = false +python-versions = ">=3.8.1,<4.0" +files = [] +develop = true + +[package.dependencies] +langchain-core = "^0.1.28" + +[package.extras] +extended-testing = ["lxml (>=5.1.0,<6.0.0)"] + +[package.source] +type = "directory" +url = "../text-splitters" + [[package]] name = "langsmith" version = "0.1.1" @@ -2815,4 +2834,4 @@ extended-testing = ["jinja2"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "de97591989f083b89c7a7bc6dabba87e29e13fddc812450d5196d564b2c02ce1" +content-hash = "092a56ee5733650e75cdacb0480d6a7fea1ff40a4a7f33500f77990a6e590ea4" diff --git a/libs/core/pyproject.toml b/libs/core/pyproject.toml index 04560296958..56443a7d84a 100644 --- a/libs/core/pyproject.toml +++ b/libs/core/pyproject.toml @@ -34,6 +34,7 @@ mypy = "^0.991" types-pyyaml = "^6.0.12.2" types-requests = "^2.28.11.5" types-jinja2 = "^2.11.9" +langchain-text-splitters = {path = "../text-splitters", develop = true} [tool.poetry.group.dev] optional = true