docs: improve document loaders index (#25365)

Co-authored-by: Erick Friis <erick@langchain.dev>
2025-08-06 11:37:12 +00:00 · 2024-08-13 18:48:48 -07:00 · 2024-08-13 18:48:48 -07:00 · 967b6f21f6
commit 967b6f21f6
parent 4a78be7861
5 changed files with 151 additions and 10 deletions
--- a/docs/docs/integrations/document_loaders/index.mdx
+++ b/docs/docs/integrations/document_loaders/index.mdx
@ -7,6 +7,39 @@ sidebar_class_name: hidden

 import { CategoryTable, IndexTable } from "@theme/FeatureTables";

+Document loaders load data into the standard LangChain `Document` format.
+
+Each document loader has its own specific parameters, but they can all be invoked in the same way with the `.load` method.
+An example use case is as follows:
+
+```python
+from langchain_community.document_loaders.csv_loader import CSVLoader
+
+loader = CSVLoader(
+    ...  # <-- Integration specific parameters here
+)
+data = loader.load()
+```
+
+## Common File Types
+
+The document loaders below allow you to load data from common file formats.
+
+<CategoryTable category="common_loaders" />
+
+## PDFs
+
+The document loaders below allow you to load PDF documents.
+
+<CategoryTable category="pdf_loaders" />
+
+## Webpages
+
+The document loaders below allow you to load webpages.
+
+<CategoryTable category="webpage_loaders" />
+
+
 ## All document loaders

 <IndexTable />
--- a/docs/docs/integrations/document_loaders/pypdfloader.ipynb
+++ b/docs/docs/integrations/document_loaders/pypdfloader.ipynb
@ -6,7 +6,7 @@
   "source": [
    "# PyPDFLoader\n",
    "\n",
-    "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
+    "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all PyPDFLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
    "\n",
    "\n",
    "## Overview\n",
@ -43,7 +43,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "%pip install -qU langchain_community"
+    "%pip install -qU langchain_community pypdf"
   ]
  },
  {
@ -180,7 +180,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -194,9 +194,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.10.1"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/docs/docs/integrations/document_loaders/web_base.ipynb
+++ b/docs/docs/integrations/document_loaders/web_base.ipynb
@ -44,7 +44,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "%pip install -qU langchain_community"
+    "%pip install -qU langchain_community beautifulsoup4"
   ]
  },
  {
@ -330,7 +330,10 @@
   "cell_type": "markdown",
   "id": "672264ad",
   "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
   },
   "source": [
    "## Using proxies\n",
@ -343,7 +346,10 @@
   "execution_count": null,
   "id": "9caf0310",
   "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
   },
   "outputs": [],
   "source": [
@ -384,7 +390,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.10.1"
  }
 },
 "nbformat": 4,
--- a/docs/src/theme/FeatureTables.js
+++ b/docs/src/theme/FeatureTables.js
@ -440,6 +440,108 @@ const FEATURE_TABLES = {
        columns: [],
        items: [],
    },
+    webpage_loaders: {
+        link: 'docs/integrations/loaders',
+        columns: [
+            {title: "Document Loader", formatter: (item) => <a href={
+                item.link
+            }>{item.name}</a>},
+            {title: "Description", formatter: (item) => item.source},
+            {title: "Package/API", formatter: (item) => item.api},
+        ],
+        items: [
+            {
+                name: "Web",
+                link: "web_base",
+                source: "Uses urllib and BeautifulSoup to load and parse HTML web pages",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"
+            },
+            {
+                name: "RecursiveURL",
+                link: "recursive_url",
+                source: "Recursively scrapes all child links from a root URL",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html"
+            },
+            {
+                name: "Sitemap",
+                link: "sitemap",
+                source: "Scrapes all pages on a given sitemap",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html"
+            },
+            {
+                name: "Firecrawl",
+                link: "firecrawl",
+                source: "API service that can be deployed locally, hosted version has free credits.",
+                api: "API",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.firecrawl.FireCrawlLoader.html"
+            }
+        ]
+    },
+    pdf_loaders: {
+        link: 'docs/integrations/loaders',
+        columns: [
+            {title: "Document Loader", formatter: (item) => <a href={
+                item.link
+            }>{item.name}</a>},
+            {title: "Description", formatter: (item) => item.source},
+            {title: "Package/API", formatter: (item) => item.api},
+        ],
+        items: [
+            {
+                name: "PyPDF",
+                link: "pypdfloader",
+                source: "Uses `pypdf` to load and parse PDFs",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html"
+            },
+            {
+                name: "Unstructured",
+                link: "unstructured_file",
+                source: "Uses Unstructured's open source library to load PDFs",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
+            },
+            {
+                name: "Amazon Textract",
+                link: "amazon_textract",
+                source: "Uses AWS API to load PDFs",
+                api: "API",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
+            }
+        ]
+    },
+    common_loaders: {
+        link: 'docs/integrations/loaders',
+        columns: [
+            {title: "Document Loader", formatter: (item) => <a href={
+                item.link
+            }>{item.name}</a>},
+            {title: "Data Type", formatter: (item) => item.source},
+        ],
+        items: [
+            {
+                name: "CSVLoader",
+                link: "csv",
+                source: "CSV files",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.csv_loader.CSVLoader.html"
+            },
+            {
+                name: "DirectoryLoader",
+                link: "document_loader_directory",
+                source: "All files in a given directory",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html"
+            },
+            {
+                name: "Unstructured",
+                link: "unstructured_file",
+                source: "All file types",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
+            },
+        ]
+    },
    vectorstores: {
        link: 'docs/integrations/vectorstores',
        columns: [
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@ -237,7 +237,7 @@ class PyPDFLoader(BasePDFLoader):
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
-                "pypdf package not found, please install it with " "`pip install pypdf`"
+                "pypdf package not found, please install it with `pip install pypdf`"
            )
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFParser(