mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-07 13:40:46 +00:00
community: add init for UnstructuredHTMLLoader
to solve pathlib paths (#29091)
## Description Add an `__init__` to `UnstructuredHTMLLoader` that restricts the input type to `str` or `Path` and converts `file_path` to `str` before passing it to the parent class, just like `UnstructuredXMLLoader` does. ## Issue Fixes #29090 ## Dependencies No changes.
This commit is contained in:
parent
c8ca1cd42f
commit
2b09f798e1
@ -1,4 +1,5 @@
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
@ -27,6 +28,23 @@ class UnstructuredHTMLLoader(UnstructuredFileLoader):
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition-html
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
):
    """Initialize the loader from a string or ``pathlib.Path`` location.

    Args:
        file_path: The path to the HTML file to load. It is converted to
            ``str`` before being handed to the parent initializer.
        mode: The mode to use when loading the file; forwarded unchanged
            to the parent initializer. Default is "single".
        **unstructured_kwargs: Extra keyword arguments forwarded unchanged
            to the parent initializer.
    """
    # Normalize Path objects to plain strings before delegating, so the
    # parent initializer always receives a ``str`` path.
    super().__init__(
        file_path=str(file_path),
        mode=mode,
        **unstructured_kwargs,
    )
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user