community: add `__init__` to UnstructuredHTMLLoader to accept pathlib paths (#29091)

## Description
Add `__init__` for `UnstructuredHTMLLoader` to declare the accepted
`file_path` types as `str` or `Path`, and convert `file_path` to `str`
before passing it to the parent class, just like `UnstructuredXMLLoader` does.

## Issue
Fixes #29090

## Dependencies
No changes.
This commit is contained in:
LIU Yuwei 2025-01-08 23:19:27 +08:00 committed by GitHub
parent c8ca1cd42f
commit 2b09f798e1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,4 +1,5 @@
from typing import List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
@ -27,6 +28,23 @@ class UnstructuredHTMLLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-html
"""
def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
):
    """Initialize the loader, accepting the path as ``str`` or ``Path``.

    Args:
        file_path: Location of the HTML file to load. It is converted to
            ``str`` before being handed to the parent constructor.
        mode: Loading mode, forwarded unchanged to the parent loader.
            Defaults to "single".
            NOTE(review): the set of accepted mode values is defined by
            the parent class, not here -- confirm against
            ``UnstructuredFileLoader``.
        **unstructured_kwargs: Extra keyword arguments forwarded
            unchanged to the parent constructor.
    """
    # The parent receives a plain string path, so coerce at the call site.
    super().__init__(file_path=str(file_path), mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.partition.html import partition_html