mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-02 11:39:18 +00:00
docs, langchain-unstructured: update langchain-unstructured docs and update ustructured-client dependency (#25451)
Be more explicit in the docs about creating an instance of the UnstructuredClient if you want to customize it versus using sdk parameters with the UnstructuredLoader. Bump the unstructured-client dependency as discussed [here](https://github.com/langchain-ai/langchain/discussions/25328#discussioncomment-10350949) --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
@@ -105,7 +105,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 3,
|
||||
"id": "79d3e549",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -131,7 +131,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 4,
|
||||
"id": "8da59ef8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -139,17 +139,16 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO: NumExpr defaulting to 12 threads.\n",
|
||||
"INFO: pikepdf C++ to Python logger bridge initialized\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')"
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-02-27T15:49:27', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -162,7 +161,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 5,
|
||||
"id": "97f7aa1f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -170,7 +169,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}\n"
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-02-27T15:49:27', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -188,17 +187,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 6,
|
||||
"id": "b05604d2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')"
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-02-27T15:49:27', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -279,7 +278,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 8,
|
||||
"id": "386eb63c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -299,7 +298,7 @@
|
||||
"Document(metadata={'source': 'example_data/fake.docx', 'category_depth': 0, 'filename': 'fake.docx', 'languages': ['por', 'cat'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': '56d531394823d81787d77a04462ed096'}, page_content='Lorem ipsum dolor sit amet.')"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -327,7 +326,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 9,
|
||||
"id": "a3d7c846",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -375,16 +374,22 @@
|
||||
"### Unstructured SDK Client\n",
|
||||
"\n",
|
||||
"Partitioning with the Unstructured API relies on the [Unstructured SDK\n",
|
||||
"Client](https://docs.unstructured.io/api-reference/api-services/sdk).\n",
|
||||
"Client](https://docs.unstructured.io/api-reference/api-services/accessing-unstructured-api).\n",
|
||||
"\n",
|
||||
"Below is an example showing how you can customize some features of the client and use your own `requests.Session()`, pass in an alternative `server_url`, or customize the `RetryConfig` object for more control over how failed requests are handled.\n",
|
||||
"\n",
|
||||
"Note that the example below may not use the latest version of the UnstructuredClient and there could be breaking changes in future releases. For the latest examples, refer to the [Unstructured Python SDK](https://docs.unstructured.io/api-reference/api-services/sdk-python) docs."
|
||||
"If you want to customize the client, you will have to pass an `UnstructuredClient` instance to the `UnstructuredLoader`. Below is an example showing how you can customize features of the client such as using your own `requests.Session()`, passing an alternative `server_url`, and customizing the `RetryConfig` object. For more information about customizing the client or what additional parameters the sdk client accepts, refer to the [Unstructured Python SDK](https://docs.unstructured.io/api-reference/api-services/sdk-python) docs and the client section of the [API Parameters](https://docs.unstructured.io/api-reference/api-services/api-parameters) docs. Note that all API Parameters should be passed to the `UnstructuredLoader`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ebb69c85",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<div class=\"alert alert-block alert-warning\"><b>Warning:</b> The example below may not use the latest version of the UnstructuredClient and there could be breaking changes in future releases. For the latest examples, refer to the <a href=\"https://docs.unstructured.io/api-reference/api-services/sdk-python\">Unstructured Python SDK</a> docs.</div>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 10,
|
||||
"id": "58e55264",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -394,13 +399,15 @@
|
||||
"text": [
|
||||
"INFO: Preparing to split document for partition.\n",
|
||||
"INFO: Concurrency level set to 5\n",
|
||||
"INFO: Splitting pages 1 to 16 (16 total)\n",
|
||||
"INFO: Determined optimal split size of 4 pages.\n",
|
||||
"INFO: Partitioning 4 files with 4 page(s) each.\n",
|
||||
"INFO: Partitioning set #1 (pages 1-4).\n",
|
||||
"INFO: Partitioning set #2 (pages 5-8).\n",
|
||||
"INFO: Partitioning set #3 (pages 9-12).\n",
|
||||
"INFO: Partitioning set #4 (pages 13-16).\n",
|
||||
"INFO: Splitting pages 1 to 10 (10 total)\n",
|
||||
"INFO: Determined optimal split size of 2 pages.\n",
|
||||
"INFO: Partitioning 5 files with 2 page(s) each.\n",
|
||||
"INFO: Partitioning set #1 (pages 1-2).\n",
|
||||
"INFO: Partitioning set #2 (pages 3-4).\n",
|
||||
"INFO: Partitioning set #3 (pages 5-6).\n",
|
||||
"INFO: Partitioning set #4 (pages 7-8).\n",
|
||||
"INFO: Partitioning set #5 (pages 9-10).\n",
|
||||
"INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n",
|
||||
"INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n",
|
||||
"INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n",
|
||||
"INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n",
|
||||
@@ -408,6 +415,7 @@
|
||||
"INFO: Successfully partitioned set #2, elements added to the final result.\n",
|
||||
"INFO: Successfully partitioned set #3, elements added to the final result.\n",
|
||||
"INFO: Successfully partitioned set #4, elements added to the final result.\n",
|
||||
"INFO: Successfully partitioned set #5, elements added to the final result.\n",
|
||||
"INFO: Successfully partitioned the document.\n"
|
||||
]
|
||||
},
|
||||
@@ -429,8 +437,8 @@
|
||||
" api_key_auth=os.getenv(\n",
|
||||
" \"UNSTRUCTURED_API_KEY\"\n",
|
||||
" ), # Note: the client API param is \"api_key_auth\" instead of \"api_key\"\n",
|
||||
" client=requests.Session(),\n",
|
||||
" server_url=\"https://api.unstructuredapp.io/general/v0/general\",\n",
|
||||
" client=requests.Session(), # Define your own requests session\n",
|
||||
" server_url=\"https://api.unstructuredapp.io/general/v0/general\", # Define your own api url\n",
|
||||
" retry_config=RetryConfig(\n",
|
||||
" strategy=\"backoff\",\n",
|
||||
" retry_connection_errors=True,\n",
|
||||
@@ -440,13 +448,15 @@
|
||||
" exponent=1.5,\n",
|
||||
" max_elapsed_time=900000,\n",
|
||||
" ),\n",
|
||||
" ),\n",
|
||||
" ), # Define your own retry config\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"loader = UnstructuredLoader(\n",
|
||||
" \"./example_data/layout-parser-paper.pdf\",\n",
|
||||
" partition_via_api=True,\n",
|
||||
" client=client,\n",
|
||||
" split_pdf_page=True,\n",
|
||||
" split_pdf_page_range=[1, 10],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"docs = loader.load()\n",
|
||||
@@ -479,17 +489,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 11,
|
||||
"id": "e9f1c20d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING: Partitioning locally even though api_key is defined since partition_via_api=False.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
@@ -542,7 +545,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
"version": "3.10.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
Reference in New Issue
Block a user