fix: pass unstructured kwargs down in all unstructured loaders (#2506)

### Summary

PR #1667 updated several Unstructured loaders to accept
`unstructured_kwargs` in their `__init__` method. However, that PR did
not cover every Unstructured loader. This PR ensures
`unstructured_kwargs` are forwarded to the underlying `partition_*`
calls in the remaining Unstructured loaders (email, EPub, Markdown, and
URL).
This commit is contained in:
Matt Robinson 2023-04-06 15:29:52 -04:00 committed by GitHub
parent c913acdb4c
commit 270384fb44
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 15 additions and 8 deletions

View File

@ -21,11 +21,11 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
if filetype == FileType.EML: if filetype == FileType.EML:
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
return partition_email(filename=self.file_path) return partition_email(filename=self.file_path, **self.unstructured_kwargs)
elif satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG: elif satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG:
from unstructured.partition.msg import partition_msg from unstructured.partition.msg import partition_msg
return partition_msg(filename=self.file_path) return partition_msg(filename=self.file_path, **self.unstructured_kwargs)
else: else:
raise ValueError( raise ValueError(
f"Filetype {filetype} is not supported in UnstructuredEmailLoader." f"Filetype {filetype} is not supported in UnstructuredEmailLoader."

View File

@ -19,4 +19,4 @@ class UnstructuredEPubLoader(UnstructuredFileLoader):
) )
from unstructured.partition.epub import partition_epub from unstructured.partition.epub import partition_epub
return partition_epub(filename=self.file_path) return partition_epub(filename=self.file_path, **self.unstructured_kwargs)

View File

@ -22,4 +22,4 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
"Partitioning markdown files is only supported in unstructured>=0.4.16." "Partitioning markdown files is only supported in unstructured>=0.4.16."
) )
return partition_md(filename=self.file_path) return partition_md(filename=self.file_path, **self.unstructured_kwargs)

View File

@ -1,6 +1,6 @@
"""Loader that uses unstructured to load HTML files.""" """Loader that uses unstructured to load HTML files."""
import logging import logging
from typing import List from typing import Any, List
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
@ -12,7 +12,11 @@ class UnstructuredURLLoader(BaseLoader):
"""Loader that uses unstructured to load HTML files.""" """Loader that uses unstructured to load HTML files."""
def __init__( def __init__(
self, urls: List[str], continue_on_failure: bool = True, headers: dict = {} self,
urls: List[str],
continue_on_failure: bool = True,
headers: dict = {},
**unstructured_kwargs: Any,
): ):
"""Initialize with file path.""" """Initialize with file path."""
try: try:
@ -35,6 +39,7 @@ class UnstructuredURLLoader(BaseLoader):
self.urls = urls self.urls = urls
self.continue_on_failure = continue_on_failure self.continue_on_failure = continue_on_failure
self.headers = headers self.headers = headers
self.unstructured_kwargs = unstructured_kwargs
def __is_headers_available(self) -> bool: def __is_headers_available(self) -> bool:
_unstructured_version = self.__version.split("-")[0] _unstructured_version = self.__version.split("-")[0]
@ -50,9 +55,11 @@ class UnstructuredURLLoader(BaseLoader):
for url in self.urls: for url in self.urls:
try: try:
if self.__is_headers_available(): if self.__is_headers_available():
elements = partition_html(url=url, headers=self.headers) elements = partition_html(
url=url, headers=self.headers, **self.unstructured_kwargs
)
else: else:
elements = partition_html(url=url) elements = partition_html(url=url, **self.unstructured_kwargs)
except Exception as e: except Exception as e:
if self.continue_on_failure: if self.continue_on_failure:
logger.error(f"Error fetching or processing {url}, exeption: {e}") logger.error(f"Error fetching or processing {url}, exeption: {e}")