Harrison/unstructured structured (#1004)

2025-09-07 05:52:15 +00:00 · 2023-02-12 07:36:11 -08:00
parent bbb06ca4cf
commit 0998577dfe
11 changed files with 363 additions and 121 deletions
--- a/docs/modules/document_loaders/examples/email.ipynb
+++ b/docs/modules/document_loaders/examples/email.ipynb
@@ -61,10 +61,61 @@
    "data"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "8bf50cba",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default, the loader combines those elements together, but you can keep them separate by specifying `mode=\"elements\"` when creating the loader."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b9592eaf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredEmailLoader('example_data/fake-email.eml', mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "0b16d03f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d7bdc5e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='This is a test email to use for unit tests.', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0]"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "4ef9a5f4",
+   "id": "6a074515",
   "metadata": {},
   "outputs": [],
   "source": []
@@ -86,7 +137,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
--- a/docs/modules/document_loaders/examples/microsoft_word.ipynb
+++ b/docs/modules/document_loaders/examples/microsoft_word.ipynb
@@ -61,10 +61,61 @@
    "data"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "5d1472e9",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default, the loader combines those elements together, but you can keep them separate by specifying `mode=\"elements\"` when creating the loader."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "93abf60b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredDocxLoader('example_data/fake.docx', mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c35cdbcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fae2d730",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "61953c83",
+   "id": "961a7b1d",
   "metadata": {},
   "outputs": [],
   "source": []
@@ -86,7 +137,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
--- a/docs/modules/document_loaders/examples/pdf.ipynb
+++ b/docs/modules/document_loaders/examples/pdf.ipynb
@@ -139,7 +139,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
   "id": "0cc0cd42",
   "metadata": {},
   "outputs": [],
@@ -149,7 +149,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
   "id": "082d557c",
   "metadata": {},
   "outputs": [],
@@ -159,14 +159,54 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
-   "id": "5c41106f",
+   "execution_count": null,
+   "id": "df11c953",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loader.load()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "09957371",
+   "metadata": {},
+   "source": [
+    "### Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default, the loader combines those elements together, but you can keep them separate by specifying `mode=\"elements\"` when creating the loader."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0fab833b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredPDFLoader(\"example_data/layout-parser-paper.pdf\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3e8ff1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43c23d2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data[0]"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "21998d18",
@@ -177,7 +217,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 7,
   "id": "2f0cc9ff",
   "metadata": {},
   "outputs": [],
@@ -187,7 +227,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 8,
   "id": "42b531e8",
   "metadata": {},
   "outputs": [],
@@ -197,7 +237,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
   "id": "010d5cdd",
   "metadata": {},
   "outputs": [],
--- a/docs/modules/document_loaders/examples/powerpoint.ipynb
+++ b/docs/modules/document_loaders/examples/powerpoint.ipynb
@@ -61,10 +61,61 @@
    "data"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "525d6b67",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default, the loader combines those elements together, but you can keep them separate by specifying `mode=\"elements\"` when creating the loader."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "064f9162",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredPowerPointLoader(\"example_data/fake-power-point.pptx\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "abefbbdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a547c534",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='Adding a Bullet Slide', lookup_str='', metadata={'source': 'example_data/fake-power-point.pptx'}, lookup_index=0)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0]"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "0c55f1cf",
+   "id": "381d4139",
   "metadata": {},
   "outputs": [],
   "source": []
@@ -86,7 +137,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
--- a/docs/modules/document_loaders/examples/unstructured_file.ipynb
+++ b/docs/modules/document_loaders/examples/unstructured_file.ipynb
@@ -12,6 +12,40 @@
  {
   "cell_type": "code",
   "execution_count": 1,
+   "id": "2886982e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Install package\n",
+    "# !pip install unstructured"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "54d62efd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Install other dependencies\n",
+    "# # https://github.com/Unstructured-IO/unstructured/blob/main/docs/source/installing.rst\n",
+    "# !brew install libmagic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "af6a64f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import nltk\n",
+    "# nltk.download('punkt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
   "id": "79d3e549",
   "metadata": {},
   "outputs": [],
@@ -21,7 +55,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
   "id": "2593d1dc",
   "metadata": {},
   "outputs": [],
@@ -31,7 +65,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
   "id": "fe34e941",
   "metadata": {},
   "outputs": [],
@@ -39,10 +73,86 @@
    "docs = loader.load()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ee449788",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\\n\\nLast year COVID-19 kept us apart. This year we are finally together again.\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\\n\\nWith a duty to one another to the American people to the Constit'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[0].page_content[:400]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7874d01d",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default, the loader combines those elements together, but you can keep them separate by specifying `mode=\"elements\"` when creating the loader."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ff5b616d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredFileLoader(\"../../state_of_the_union.txt\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "feca3b6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "fec5bbac",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='Last year COVID-19 kept us apart. This year we are finally together again.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='With a duty to one another to the American people to the Constitution.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n",
+       " Document(page_content='And with an unwavering resolve that freedom will always triumph over tyranny.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[:5]"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "24e577e5",
+   "id": "8ca8a648",
   "metadata": {},
   "outputs": [],
   "source": []
@@ -64,7 +174,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,