mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-24 15:43:54 +00:00
unstructured[patch]: support loading URLs (#26670)
`unstructured.partition.auto.partition` supports a `url` kwarg, but `url` in `UnstructuredLoader.__init__` is reserved for the server URL. Here we add a `web_url` kwarg that is passed to the partition kwargs as `url`: `self.unstructured_kwargs["url"] = web_url`
This commit is contained in:
parent
311f861547
commit
eef18dec44
@ -16,7 +16,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/docs/integrations/document_loaders/file_loaders/unstructured/)|\n",
|
"| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/docs/integrations/document_loaders/file_loaders/unstructured/)|\n",
|
||||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||||
"| [UnstructuredLoader](https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html) | [langchain_community](https://python.langchain.com/api_reference/unstructured/index.html) | ✅ | ❌ | ✅ | \n",
|
"| [UnstructuredLoader](https://python.langchain.com/api_reference/unstructured/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html) | [langchain_unstructured](https://python.langchain.com/api_reference/unstructured/index.html) | ✅ | ❌ | ✅ | \n",
|
||||||
"### Loader features\n",
|
"### Loader features\n",
|
||||||
"| Source | Document Lazy Loading | Native Async Support\n",
|
"| Source | Document Lazy Loading | Native Async Support\n",
|
||||||
"| :---: | :---: | :---: | \n",
|
"| :---: | :---: | :---: | \n",
|
||||||
@ -519,6 +519,47 @@
|
|||||||
"print(\"Length of text in the document:\", len(docs[0].page_content))"
|
"print(\"Length of text in the document:\", len(docs[0].page_content))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "3ec3c22d-02cd-498b-921f-b839d1404f32",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Loading web pages\n",
|
||||||
|
"\n",
|
||||||
|
"`UnstructuredLoader` accepts a `web_url` kwarg when run locally that populates the `url` parameter of the underlying Unstructured [partition](https://docs.unstructured.io/open-source/core-functionality/partitioning). This allows for the parsing of remotely hosted documents, such as HTML web pages.\n",
|
||||||
|
"\n",
|
||||||
|
"Example usage:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "bf9a8546-659d-4861-bff2-fdf1ad93ac65",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"page_content='Example Domain' metadata={'category_depth': 0, 'languages': ['eng'], 'filetype': 'text/html', 'url': 'https://www.example.com', 'category': 'Title', 'element_id': 'fdaa78d856f9d143aeeed85bf23f58f8'}\n",
|
||||||
|
"\n",
|
||||||
|
"page_content='This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.' metadata={'languages': ['eng'], 'parent_id': 'fdaa78d856f9d143aeeed85bf23f58f8', 'filetype': 'text/html', 'url': 'https://www.example.com', 'category': 'NarrativeText', 'element_id': '3652b8458b0688639f973fe36253c992'}\n",
|
||||||
|
"\n",
|
||||||
|
"page_content='More information...' metadata={'category_depth': 0, 'link_texts': ['More information...'], 'link_urls': ['https://www.iana.org/domains/example'], 'languages': ['eng'], 'filetype': 'text/html', 'url': 'https://www.example.com', 'category': 'Title', 'element_id': '793ab98565d6f6d6f3a6d614e3ace2a9'}\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain_unstructured import UnstructuredLoader\n",
|
||||||
|
"\n",
|
||||||
|
"loader = UnstructuredLoader(web_url=\"https://www.example.com\")\n",
|
||||||
|
"docs = loader.load()\n",
|
||||||
|
"\n",
|
||||||
|
"for doc in docs:\n",
|
||||||
|
" print(f\"{doc}\\n\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "ce01aa40",
|
"id": "ce01aa40",
|
||||||
@ -546,7 +587,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.13"
|
"version": "3.10.4"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -76,6 +76,25 @@ class UnstructuredLoader(BaseLoader):
|
|||||||
{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}
|
{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}
|
||||||
|
|
||||||
|
|
||||||
|
Load URL:
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
loader = UnstructuredLoader(web_url="https://www.example.com/")
docs = loader.load()
|
||||||
|
print(docs[0])
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
page_content='Example Domain' metadata={'category_depth': 0, 'languages': ['eng'], 'filetype': 'text/html', 'url': 'https://www.example.com/', 'category': 'Title', 'element_id': 'fdaa78d856f9d143aeeed85bf23f58f8'}
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
print(docs[1])
|
||||||
|
|
||||||
|
.. code-block:: none
|
||||||
|
|
||||||
|
page_content='This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.' metadata={'languages': ['eng'], 'parent_id': 'fdaa78d856f9d143aeeed85bf23f58f8', 'filetype': 'text/html', 'url': 'https://www.example.com/', 'category': 'NarrativeText', 'element_id': '3652b8458b0688639f973fe36253c992'}
|
||||||
|
|
||||||
|
|
||||||
References
|
References
|
||||||
----------
|
----------
|
||||||
https://docs.unstructured.io/api-reference/api-services/sdk
|
https://docs.unstructured.io/api-reference/api-services/sdk
|
||||||
@ -95,6 +114,7 @@ class UnstructuredLoader(BaseLoader):
|
|||||||
api_key: Optional[str] = None,
|
api_key: Optional[str] = None,
|
||||||
client: Optional[UnstructuredClient] = None,
|
client: Optional[UnstructuredClient] = None,
|
||||||
url: Optional[str] = None,
|
url: Optional[str] = None,
|
||||||
|
web_url: Optional[str] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
):
|
):
|
||||||
"""Initialize loader."""
|
"""Initialize loader."""
|
||||||
@ -124,6 +144,8 @@ class UnstructuredLoader(BaseLoader):
|
|||||||
self.partition_via_api = partition_via_api
|
self.partition_via_api = partition_via_api
|
||||||
self.post_processors = post_processors
|
self.post_processors = post_processors
|
||||||
self.unstructured_kwargs = kwargs
|
self.unstructured_kwargs = kwargs
|
||||||
|
if web_url:
|
||||||
|
self.unstructured_kwargs["url"] = web_url
|
||||||
|
|
||||||
def lazy_load(self) -> Iterator[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Load file(s) to the _UnstructuredBaseLoader."""
|
"""Load file(s) to the _UnstructuredBaseLoader."""
|
||||||
|
@ -128,6 +128,17 @@ def test_loader_partitions_locally_and_applies_post_processors(
|
|||||||
assert docs[0].page_content.endswith("THE END!")
|
assert docs[0].page_content.endswith("THE END!")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.local
def test_url_loader() -> None:
    """Check that ``web_url`` is forwarded to partitioning and loads a page.

    Loads https://www.example.com/ through ``UnstructuredLoader`` and checks
    the content and metadata of every returned document.
    """
    web_url = "https://www.example.com/"
    docs = UnstructuredLoader(web_url=web_url).load()

    # Guard against a vacuous pass: the loop below asserts nothing when
    # partitioning yields no documents.
    assert docs

    for doc in docs:
        assert doc.page_content
        assert doc.metadata["filetype"] == "text/html"
        # The partitioned elements should record the URL they came from.
        assert doc.metadata["url"] == web_url
        assert doc.metadata["category"]
|
||||||
|
|
||||||
|
|
||||||
# -- API partition --
|
# -- API partition --
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user