mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-06 13:18:12 +00:00
feat: add support for non-html in UnstructuredURLLoader
(#2793)
### Summary

Adds support for processing non-HTML document types in the URL loader. For example, the URL loader can now process PDF or Markdown files hosted at a URL.

### Testing

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"]
loader = UnstructuredURLLoader(urls=urls, strategy="fast")
docs = loader.load()
print(docs[0].page_content[:1000])
```
This commit is contained in:
parent
e081c62aac
commit
f0be3b0689
@ -47,17 +47,26 @@ class UnstructuredURLLoader(BaseLoader):
|
|||||||
|
|
||||||
return unstructured_version >= (0, 5, 7)
|
return unstructured_version >= (0, 5, 7)
|
||||||
|
|
||||||
|
def __is_non_html_available(self) -> bool:
    """Return True when the stored version string is at least 0.5.12.

    The version is read from ``self.__version`` (presumably the installed
    ``unstructured`` package version -- confirm where it is assigned).
    ``load`` consults this check before calling the generic ``partition``
    function instead of ``partition_html``.

    Raises:
        ValueError: if a dot-separated component of the version (after
            removing any ``-suffix``) is not an integer.
    """
    # Keep only the text before the first "-" (e.g. "0.5.12-dev0" -> "0.5.12").
    release, _, _ = self.__version.partition("-")
    # Compare as a tuple of ints so that 0.5.12 sorts after 0.5.7.
    parsed = tuple(int(component) for component in release.split("."))
    return parsed >= (0, 5, 12)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load file."""
|
"""Load file."""
|
||||||
|
from unstructured.partition.auto import partition
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
docs: List[Document] = list()
|
docs: List[Document] = list()
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
try:
|
try:
|
||||||
if self.__is_headers_available():
|
if self.headers and self.__is_headers_available():
|
||||||
elements = partition_html(
|
elements = partition_html(
|
||||||
url=url, headers=self.headers, **self.unstructured_kwargs
|
url=url, headers=self.headers, **self.unstructured_kwargs
|
||||||
)
|
)
|
||||||
|
elif self.__is_non_html_available():
|
||||||
|
elements = partition(url=url, **self.unstructured_kwargs)
|
||||||
else:
|
else:
|
||||||
elements = partition_html(url=url, **self.unstructured_kwargs)
|
elements = partition_html(url=url, **self.unstructured_kwargs)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
Loading…
Reference in New Issue
Block a user