mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-25 21:37:20 +00:00
unstructured[patch]: support loading URLs (#26670)
`unstructured.partition.auto.partition` supports a `url` kwarg, but `url` in `UnstructuredLoader.__init__` is reserved for the server URL. Here we add a `web_url` kwarg that is passed to the partition kwargs:

```python
self.unstructured_kwargs["url"] = web_url
```
This commit is contained in:
@@ -128,6 +128,17 @@ def test_loader_partitions_locally_and_applies_post_processors(
|
||||
assert docs[0].page_content.endswith("THE END!")
|
||||
|
||||
|
||||
@pytest.mark.local
def test_url_loader() -> None:
    """Check that ``UnstructuredLoader`` can load a page given via ``web_url``.

    The loader is constructed with only ``web_url`` and ``load()`` is called;
    every returned document must have non-empty content, an HTML filetype,
    the source URL in its metadata, and a non-empty category.

    NOTE(review): this presumably fetches the page over the network, so it
    needs outbound HTTP access to run -- confirm against the CI setup.
    """
    web_url = "https://www.example.com/"
    docs = UnstructuredLoader(web_url=web_url).load()

    # Without this check the loop below would be skipped for an empty result
    # and the test would pass without verifying anything.
    assert docs

    for doc in docs:
        assert doc.page_content
        assert doc.metadata["filetype"] == "text/html"
        assert doc.metadata["url"] == web_url
        assert doc.metadata["category"]
|
||||
|
||||
|
||||
# -- API partition --
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user