mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-29 11:09:07 +00:00
community: DocumentLoaderAsParser wrapper (#27749)
## Description This pull request introduces the `DocumentLoaderAsParser` class, which acts as an adapter to transform document loaders into parsers within the LangChain framework. The class enables document loaders that accept a `file_path` parameter to be utilized as blob parsers. This is particularly useful for integrating various document loading capabilities seamlessly into the LangChain ecosystem. When merged in together with PR https://github.com/langchain-ai/langchain/pull/27716 It opens options for `SharePointLoader` / `OneDriveLoader` to process any filetype that has a document loader. ### Features - **Flexible Parsing**: The `DocumentLoaderAsParser` class can adapt any document loader that meets the criteria of accepting a `file_path` argument, allowing for lazy parsing of documents. - **Compatibility**: The class has been designed to work with various document loaders, making it versatile for different use cases. ### Usage Example To use the `DocumentLoaderAsParser`, you would initialize it with a suitable document loader class and any required parameters. Here’s an example of how to do this with the `UnstructuredExcelLoader`: ```python from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.parsers.documentloader_adapter import DocumentLoaderAsParser from langchain_community.document_loaders.excel import UnstructuredExcelLoader # Initialize the parser adapter with UnstructuredExcelLoader xlsx_parser = DocumentLoaderAsParser(UnstructuredExcelLoader, mode="paged") # Use parser, for ex. pass it to MimeTypeBasedParser MimeTypeBasedParser( handlers={ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": xlsx_parser } ) ``` - **Dependencies:** None - **Twitter handle:** @martintriska1 If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
parent
9b024d00c9
commit
e6b41d081d
@ -0,0 +1,67 @@
|
||||
import inspect
|
||||
from typing import Any, Dict, Iterator, Type
|
||||
|
||||
from langchain.document_loaders.base import BaseBlobParser, BaseLoader
|
||||
from langchain_core._api import beta
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.documents.base import Blob
|
||||
|
||||
|
||||
@beta()
class DocumentLoaderAsParser(BaseBlobParser):
    """Adapter that lets a document loader be used as a blob parser.

    This is a work-around for file types that only have a document loader.
    Prefer a dedicated parser when one is available.

    The wrapped loader class must accept a ``file_path`` parameter in its
    constructor; a new loader instance is created for every parsed blob.
    """

    # Annotations mirror the attributes actually assigned in ``__init__``.
    DocumentLoaderClass: Type[BaseLoader]
    document_loader_kwargs: Dict[str, Any]

    def __init__(self, document_loader_class: Type[BaseLoader], **kwargs: Any) -> None:
        """Initialize the adapter with a loader class and its constructor arguments.

        Args:
            document_loader_class: The document loader class to adapt as a parser.
            **kwargs: Additional arguments passed to the document loader's
                constructor each time a blob is parsed.

        Raises:
            TypeError: If ``document_loader_class.__init__`` has no ``file_path``
                parameter; only such loaders can be adapted.

        Example:
            ```
            from langchain_community.document_loaders.excel import UnstructuredExcelLoader

            # Initialize parser adapter with a document loader
            excel_parser = DocumentLoaderAsParser(UnstructuredExcelLoader, mode="elements")
            ```
        """  # noqa: E501
        super().__init__()

        # Validate before storing any state so a rejected loader class never
        # leaves a half-configured adapter behind.
        init_signature = inspect.signature(document_loader_class.__init__)
        if "file_path" not in init_signature.parameters:
            # Each fragment ends with a space: adjacent string literals are
            # concatenated verbatim, without any separator.
            raise TypeError(
                f"{document_loader_class.__name__} does not accept `file_path`. "
                "Only document loaders with `file_path` parameter "
                "can be morphed into a parser."
            )

        self.DocumentLoaderClass = document_loader_class
        self.document_loader_kwargs = kwargs

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob with the underlying document loader.

        A loader is instantiated with ``file_path=blob.path`` plus the keyword
        arguments given at construction time. Every document it yields has its
        metadata updated with ``blob.metadata`` (blob values win on key clashes).

        Args:
            blob: The blob to parse. It must reference a file via ``blob.path``,
                because the loader reads from a path rather than from the
                blob's in-memory data.

        Yields:
            Documents produced by the loader's ``lazy_load``.

        Raises:
            ValueError: If ``blob.path`` is ``None``.
        """
        if blob.path is None:
            # The wrapped loader only knows how to read from a file path;
            # fail with a clear message instead of an obscure loader error.
            raise ValueError(
                f"{type(self).__name__} requires a blob with a `path`; "
                "in-memory blobs cannot be passed to "
                f"{self.DocumentLoaderClass.__name__}."
            )

        doc_loader = self.DocumentLoaderClass(
            file_path=blob.path, **self.document_loader_kwargs
        )  # type: ignore
        for document in doc_loader.lazy_load():
            document.metadata.update(blob.metadata)
            yield document
|
Loading…
Reference in New Issue
Block a user