Vwp/docs improved document loaders (#4006)

Huge thanks to @leo-gan for improving the document loaders notebooks --------- Co-authored-by: Leonid Ganeline <leo.gan.57@gmail.com>
2025-10-08 05:41:56 +00:00 · 2023-05-02 15:24:53 -07:00
parent 1c68cbdb28
commit aa38355999
57 changed files with 1227 additions and 779 deletions
--- a/docs/modules/indexes/document_loaders/examples/html.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/html.ipynb
@@ -7,7 +7,7 @@
   "source": [
    "# HTML\n",
    "\n",
-    "This covers how to load HTML documents into a document format that we can use downstream."
+    "This notebook covers how to load `HTML` documents into a document format that we can use downstream."
   ]
  },
  {
@@ -48,7 +48,9 @@
   "outputs": [
    {
     "data": {
-      "text/plain": "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]"
+      "text/plain": [
+       "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]"
+      ]
     },
     "execution_count": 4,
     "metadata": {},
@@ -61,20 +63,21 @@
  },
  {
   "cell_type": "markdown",
+   "id": "00337aae",
+   "metadata": {},
   "source": [
    "## Loading HTML with BeautifulSoup4\n",
    "\n",
-    "We can also use BeautifulSoup4 to load HTML documents using the `BSHTMLLoader`.  This will extract the text from the html into `page_content`, and the page title as `title` into `metadata`."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+    "We can also use `BeautifulSoup4` to load HTML documents using the `BSHTMLLoader`. This extracts the text from the HTML into `page_content` and stores the page title in `metadata` under the `title` key."
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 1,
   "id": "79b1bce4",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
   "outputs": [],
   "source": [
    "from langchain.document_loaders import BSHTMLLoader"
@@ -82,13 +85,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 2,
+   "id": "4be99e6c",
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    },
+    "tags": []
+   },
   "outputs": [
    {
     "data": {
-      "text/plain": "[Document(page_content='\\n\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n', lookup_str='', metadata={'source': 'example_data/fake-content.html', 'title': 'Test Title'}, lookup_index=0)]"
+      "text/plain": [
+       "[Document(page_content='\\n\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n', metadata={'source': 'example_data/fake-content.html', 'title': 'Test Title'})]"
+      ]
     },
-     "execution_count": 17,
+     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -97,19 +110,7 @@
    "loader = BSHTMLLoader(\"example_data/fake-content.html\")\n",
    "data = loader.load()\n",
    "data"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "outputs": [],
-   "source": [],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  }
 ],
 "metadata": {
@@ -128,7 +129,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.10.6"
  }
 },
 "nbformat": 4,