enhancement: support headers for non-html urls (#3166)

### Summary Updates the `UnstructuredURLLoader` to support passing in headers for non-HTML content types. While this update maintains backward compatibility with older versions of `unstructured`, we strongly recommend upgrading to `unstructured>=0.5.13` if you are using the `UnstructuredURLLoader`. ### Testing #### With headers ```python from langchain.document_loaders import UnstructuredURLLoader urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"] loader = UnstructuredURLLoader(urls=urls, headers={"Accept": "application/json"}, strategy="fast") docs = loader.load() print(docs[0].page_content[:1000]) ``` #### Without headers ```python from langchain.document_loaders import UnstructuredURLLoader urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"] loader = UnstructuredURLLoader(urls=urls, strategy="fast") docs = loader.load() print(docs[0].page_content[:1000]) ``` --------- Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
2025-06-19 21:33:51 +00:00 · 2023-04-19 19:16:24 -04:00 · 2023-04-19 19:16:24 -04:00 · 3e0c44bae8
commit 3e0c44bae8
parent 7b1f0656b8
1 changed files with 33 additions and 14 deletions
--- a/langchain/document_loaders/url.py
+++ b/langchain/document_loaders/url.py
@ -15,7 +15,6 @@ class UnstructuredURLLoader(BaseLoader):
        self,
        urls: List[str],
        continue_on_failure: bool = True,
        headers: dict = {},
        **unstructured_kwargs: Any,
    ):
        """Initialize with file path."""
@ -30,9 +29,17 @@ class UnstructuredURLLoader(BaseLoader):
                "`pip install unstructured`"
            )
-        if not self.__is_headers_available() and len(headers.keys()) != 0:
+        headers = unstructured_kwargs.pop("headers", {})
        if len(headers.keys()) != 0:
            warn_about_headers = False
            if self.__is_non_html_available():
                warn_about_headers = not self.__is_headers_available_for_non_html()
            else:
                warn_about_headers = not self.__is_headers_available_for_html()
            if warn_about_headers:
                logger.warning(
-                "You are using old version of unstructured. "
+                    "You are using an old version of unstructured. "
                    "The headers parameter is ignored"
                )
@ -41,12 +48,18 @@ class UnstructuredURLLoader(BaseLoader):
        self.headers = headers
        self.unstructured_kwargs = unstructured_kwargs
-    def __is_headers_available(self) -> bool:
+    def __is_headers_available_for_html(self) -> bool:
        _unstructured_version = self.__version.split("-")[0]
        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
        return unstructured_version >= (0, 5, 7)
    def __is_headers_available_for_non_html(self) -> bool:
        _unstructured_version = self.__version.split("-")[0]
        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
        return unstructured_version >= (0, 5, 13)
    def __is_non_html_available(self) -> bool:
        _unstructured_version = self.__version.split("-")[0]
        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
@ -61,12 +74,18 @@ class UnstructuredURLLoader(BaseLoader):
        docs: List[Document] = list()
        for url in self.urls:
            try:
-                if self.headers and self.__is_headers_available():
+                if self.__is_non_html_available():
                    if self.__is_headers_available_for_non_html():
                        elements = partition(
                            url=url, headers=self.headers, **self.unstructured_kwargs
                        )
                    else:
                        elements = partition(url=url, **self.unstructured_kwargs)
                else:
                    if self.__is_headers_available_for_html():
                        elements = partition_html(
                            url=url, headers=self.headers, **self.unstructured_kwargs
                        )
                elif self.__is_non_html_available():
                    elements = partition(url=url, **self.unstructured_kwargs)
                    else:
                        elements = partition_html(url=url, **self.unstructured_kwargs)
            except Exception as e: