mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 20:28:10 +00:00
fix: pass unstructured kwargs down in all unstructured loaders (#2506)
### Summary

#1667 updated several Unstructured loaders to accept `unstructured_kwargs` in the `__init__` function. However, that PR did not add this functionality to every Unstructured loader. This PR ensures that `unstructured_kwargs` are passed down in all remaining Unstructured loaders.
This commit is contained in:
parent
c913acdb4c
commit
270384fb44
@ -21,11 +21,11 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
|
|||||||
if filetype == FileType.EML:
|
if filetype == FileType.EML:
|
||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
|
|
||||||
return partition_email(filename=self.file_path)
|
return partition_email(filename=self.file_path, **self.unstructured_kwargs)
|
||||||
elif satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG:
|
elif satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG:
|
||||||
from unstructured.partition.msg import partition_msg
|
from unstructured.partition.msg import partition_msg
|
||||||
|
|
||||||
return partition_msg(filename=self.file_path)
|
return partition_msg(filename=self.file_path, **self.unstructured_kwargs)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Filetype {filetype} is not supported in UnstructuredEmailLoader."
|
f"Filetype {filetype} is not supported in UnstructuredEmailLoader."
|
||||||
|
@ -19,4 +19,4 @@ class UnstructuredEPubLoader(UnstructuredFileLoader):
|
|||||||
)
|
)
|
||||||
from unstructured.partition.epub import partition_epub
|
from unstructured.partition.epub import partition_epub
|
||||||
|
|
||||||
return partition_epub(filename=self.file_path)
|
return partition_epub(filename=self.file_path, **self.unstructured_kwargs)
|
||||||
|
@ -22,4 +22,4 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
|
|||||||
"Partitioning markdown files is only supported in unstructured>=0.4.16."
|
"Partitioning markdown files is only supported in unstructured>=0.4.16."
|
||||||
)
|
)
|
||||||
|
|
||||||
return partition_md(filename=self.file_path)
|
return partition_md(filename=self.file_path, **self.unstructured_kwargs)
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
"""Loader that uses unstructured to load HTML files."""
|
"""Loader that uses unstructured to load HTML files."""
|
||||||
import logging
|
import logging
|
||||||
from typing import List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
@ -12,7 +12,11 @@ class UnstructuredURLLoader(BaseLoader):
|
|||||||
"""Loader that uses unstructured to load HTML files."""
|
"""Loader that uses unstructured to load HTML files."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, urls: List[str], continue_on_failure: bool = True, headers: dict = {}
|
self,
|
||||||
|
urls: List[str],
|
||||||
|
continue_on_failure: bool = True,
|
||||||
|
headers: dict = {},
|
||||||
|
**unstructured_kwargs: Any,
|
||||||
):
|
):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
try:
|
try:
|
||||||
@ -35,6 +39,7 @@ class UnstructuredURLLoader(BaseLoader):
|
|||||||
self.urls = urls
|
self.urls = urls
|
||||||
self.continue_on_failure = continue_on_failure
|
self.continue_on_failure = continue_on_failure
|
||||||
self.headers = headers
|
self.headers = headers
|
||||||
|
self.unstructured_kwargs = unstructured_kwargs
|
||||||
|
|
||||||
def __is_headers_available(self) -> bool:
|
def __is_headers_available(self) -> bool:
|
||||||
_unstructured_version = self.__version.split("-")[0]
|
_unstructured_version = self.__version.split("-")[0]
|
||||||
@ -50,9 +55,11 @@ class UnstructuredURLLoader(BaseLoader):
|
|||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
try:
|
try:
|
||||||
if self.__is_headers_available():
|
if self.__is_headers_available():
|
||||||
elements = partition_html(url=url, headers=self.headers)
|
elements = partition_html(
|
||||||
|
url=url, headers=self.headers, **self.unstructured_kwargs
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
elements = partition_html(url=url)
|
elements = partition_html(url=url, **self.unstructured_kwargs)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.continue_on_failure:
|
if self.continue_on_failure:
|
||||||
logger.error(f"Error fetching or processing {url}, exeption: {e}")
|
logger.error(f"Error fetching or processing {url}, exeption: {e}")
|
||||||
|
Loading…
Reference in New Issue
Block a user