mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-30 11:39:03 +00:00
## Description

This pull request introduces the `DocumentLoaderAsParser` class, which acts as an adapter to transform document loaders into parsers within the LangChain framework. The class enables document loaders that accept a `file_path` parameter to be utilized as blob parsers. This is particularly useful for integrating various document loading capabilities seamlessly into the LangChain ecosystem.

When merged together with PR https://github.com/langchain-ai/langchain/pull/27716, it opens up options for `SharePointLoader` / `OneDriveLoader` to process any filetype that has a document loader.

### Features

- **Flexible Parsing**: The `DocumentLoaderAsParser` class can adapt any document loader that meets the criteria of accepting a `file_path` argument, allowing for lazy parsing of documents.
- **Compatibility**: The class has been designed to work with various document loaders, making it versatile for different use cases.

### Usage Example

To use the `DocumentLoaderAsParser`, you would initialize it with a suitable document loader class and any required parameters. Here’s an example of how to do this with the `UnstructuredExcelLoader`:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.documentloader_adapter import DocumentLoaderAsParser
from langchain_community.document_loaders.excel import UnstructuredExcelLoader

# Initialize the parser adapter with UnstructuredExcelLoader
xlsx_parser = DocumentLoaderAsParser(UnstructuredExcelLoader, mode="paged")

# Use the parser, e.g. pass it to MimeTypeBasedParser
MimeTypeBasedParser(
    handlers={
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": xlsx_parser
    }
)
```

- **Dependencies:** None
- **Twitter handle:** @martintriska1

If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
68 lines
2.6 KiB
Python
68 lines
2.6 KiB
Python
import inspect
|
|
from typing import Any, Dict, Iterator, Type
|
|
|
|
from langchain.document_loaders.base import BaseBlobParser, BaseLoader
|
|
from langchain_core._api import beta
|
|
from langchain_core.documents import Document
|
|
from langchain_core.documents.base import Blob
|
|
|
|
|
|
@beta()
class DocumentLoaderAsParser(BaseBlobParser):
    """A wrapper class that adapts a document loader to function as a parser.

    This class is a work-around that adapts a document loader to function as a parser.
    It is recommended to use a proper parser, if available.

    Requires the document loader to accept a `file_path` parameter.
    """

    # Loader class that is instantiated once for every blob being parsed.
    DocumentLoaderClass: Type[BaseLoader]
    # Extra keyword arguments forwarded unchanged to the loader's constructor.
    document_loader_kwargs: Dict[str, Any]

    def __init__(self, document_loader_class: Type[BaseLoader], **kwargs: Any) -> None:
        """
        Initialize the DocumentLoaderAsParser with a specific document loader class
        and additional arguments.

        Args:
            document_loader_class (Type[BaseLoader]): The document loader class to adapt
                as a parser.
            **kwargs: Additional arguments passed to the document loader's constructor.

        Raises:
            TypeError: If the specified document loader does not accept a `file_path` parameter,
                an exception is raised, as only loaders with this parameter can be adapted.

        Example:
            ```
            from langchain_community.document_loaders.excel import UnstructuredExcelLoader

            # Initialize parser adapter with a document loader
            excel_parser = DocumentLoaderAsParser(UnstructuredExcelLoader, mode="elements")
            ```
        """  # noqa: E501
        super().__init__()

        # Ensure the document loader class has a `file_path` parameter before
        # storing any state, so a rejected loader never ends up on the instance.
        init_signature = inspect.signature(document_loader_class.__init__)
        if "file_path" not in init_signature.parameters:
            # Adjacent literals are concatenated verbatim, so each fragment
            # carries its own trailing space to keep the sentence readable.
            raise TypeError(
                f"{document_loader_class.__name__} does not accept `file_path`. "
                "Only document loaders with `file_path` parameter "
                "can be morphed into a parser."
            )

        self.DocumentLoaderClass = document_loader_class
        self.document_loader_kwargs = kwargs

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """
        Use underlying DocumentLoader to lazily parse the blob.

        A new loader is created for every call, receiving `blob.path` as its
        `file_path` argument together with the keyword arguments captured at
        construction time.

        Args:
            blob (Blob): The blob to parse. Only `blob.path` and `blob.metadata`
                are read here.

        Yields:
            Document: Each document produced by the loader's `lazy_load()`, after
            its metadata has been updated with the blob's metadata.
        """
        # NOTE(review): `blob.path` is forwarded as-is; a blob that has no path
        # (presumably in-memory data) is not special-cased here.
        doc_loader = self.DocumentLoaderClass(
            file_path=blob.path, **self.document_loader_kwargs
        )  # type: ignore
        for document in doc_loader.lazy_load():
            # Merge blob-level metadata into what the loader produced.
            document.metadata.update(blob.metadata)
            yield document