unstructured[patch]: support loading URLs (#26670)

`unstructured.partition.auto.partition` supports a `url` kwarg, but
`url` in `UnstructuredLoader.__init__` is reserved for the server URL.
Here we add a `web_url` kwarg that is passed to the partition kwargs:
```python
self.unstructured_kwargs["url"] = web_url
```
This commit is contained in:
ccurme
2024-09-19 14:40:25 -04:00
committed by GitHub
parent 311f861547
commit eef18dec44
3 changed files with 76 additions and 2 deletions

View File

@@ -128,6 +128,17 @@ def test_loader_partitions_locally_and_applies_post_processors(
assert docs[0].page_content.endswith("THE END!")
@pytest.mark.local
def test_url_loader() -> None:
    """Check that ``UnstructuredLoader`` can load documents from a web URL.

    The page is addressed with the ``web_url`` keyword rather than ``url``.
    Every returned document must have non-empty content, report an HTML
    filetype, echo the source URL in its metadata, and carry a category.

    NOTE(review): this presumably fetches the page over the network, so it
    needs connectivity wherever ``local``-marked tests run -- confirm.
    """
    target_url = "https://www.example.com/"
    docs = UnstructuredLoader(web_url=target_url).load()

    # Without this guard an empty result would skip the loop below and the
    # test would pass without checking anything.
    assert docs, "expected at least one document to be loaded from the URL"

    for doc in docs:
        assert doc.page_content
        assert doc.metadata["filetype"] == "text/html"
        assert doc.metadata["url"] == target_url
        assert doc.metadata["category"]
# -- API partition --