docs: improve document loaders index (#25365)

Co-authored-by: Erick Friis <erick@langchain.dev>
2025-08-05 19:15:44 +00:00 · 2024-08-13 18:48:48 -07:00 · 2024-08-13 18:48:48 -07:00 · 967b6f21f6
commit 967b6f21f6
parent 4a78be7861
5 changed files with 151 additions and 10 deletions
--- a/docs/docs/integrations/document_loaders/index.mdx
+++ b/docs/docs/integrations/document_loaders/index.mdx
@ -7,6 +7,39 @@ sidebar_class_name: hidden
 import { CategoryTable, IndexTable } from "@theme/FeatureTables";
 DocumentLoaders load data into the standard LangChain Document format.
 Each DocumentLoader has its own specific parameters, but they can all be invoked in the same way with the `.load` method.
 An example use case is as follows:
 ```python
 from langchain_community.document_loaders.csv_loader import CSVLoader
 loader = CSVLoader(
    ...  # <-- Integration specific parameters here
 )
 data = loader.load()
 ```
 ## Common File Types
 The below document loaders allow you to load data from common data formats.
 <CategoryTable category="common_loaders" />
 ## PDFs
 The below document loaders allow you to load PDF documents.
 <CategoryTable category="pdf_loaders" />
 ## Webpages
 The below document loaders allow you to load webpages.
 <CategoryTable category="webpage_loaders" />
 ## All document loaders
 <IndexTable />
--- a/docs/docs/integrations/document_loaders/pypdfloader.ipynb
+++ b/docs/docs/integrations/document_loaders/pypdfloader.ipynb
@ -6,7 +6,7 @@
   "source": [
    "# PyPDFLoader\n",
    "\n",
-    "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
+    "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all DocumentLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
    "\n",
    "\n",
    "## Overview\n",
@ -43,7 +43,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "%pip install -qU langchain_community"
+    "%pip install -qU langchain_community pypdf"
   ]
  },
  {
@ -180,7 +180,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -194,9 +194,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.10.1"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/docs/docs/integrations/document_loaders/web_base.ipynb
+++ b/docs/docs/integrations/document_loaders/web_base.ipynb
@ -44,7 +44,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "%pip install -qU langchain_community"
+    "%pip install -qU langchain_community beautifulsoup4"
   ]
  },
  {
@ -330,7 +330,10 @@
   "cell_type": "markdown",
   "id": "672264ad",
   "metadata": {
-    "collapsed": false
+    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "source": [
    "## Using proxies\n",
@ -343,7 +346,10 @@
   "execution_count": null,
   "id": "9caf0310",
   "metadata": {
-    "collapsed": false
+    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
@ -384,7 +390,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.10.1"
  }
 },
 "nbformat": 4,
--- a/docs/src/theme/FeatureTables.js
+++ b/docs/src/theme/FeatureTables.js
@ -440,6 +440,108 @@ const FEATURE_TABLES = {
        columns: [],
        items: [],
    },
    // Table definition for the "webpage_loaders" category (the key referenced
    // by `<CategoryTable category="webpage_loaders" />` in the loaders index).
    // NOTE(review): `link` points at `docs/integrations/loaders`, while the docs
    // directory is named `document_loaders` — confirm this path is intended.
    webpage_loaders: {
        link: 'docs/integrations/loaders',
        // Each column pairs a header `title` with a `formatter` that maps one
        // entry of `items` to what is shown for it.
        columns: [
            // Loader name rendered as an anchor pointing at the item's `link`.
            {title: "Document Loader", formatter: (item) => <a href={
                item.link
            }>{item.name}</a>},
            // The short description is stored under the `source` key.
            {title: "Description", formatter: (item) => item.source},
            // Plain-text "Package" / "API" label taken from `item.api`.
            {title: "Package/API", formatter: (item) => item.api},
        ],
        // NOTE(review): every item carries an `apiLink`, but none of the column
        // formatters above reads it — confirm whether it is consumed elsewhere.
        items: [
            {
                name: "Web",
                link: "web_base",
                source: "Uses urllib and BeautifulSoup to load and parse HTML web pages",
                api: "Package",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"
            },
            {
                name: "RecursiveURL",
                link: "recursive_url",
                source: "Recursively scrapes all child links from a root URL",
                api: "Package",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html"
            },
            {
                name: "Sitemap",
                link: "sitemap",
                source: "Scrapes all pages on a given sitemap",
                api: "Package",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html"
            },
            {
                name: "Firecrawl",
                link: "firecrawl",
                source: "API service that can be deployed locally, hosted version has free credits.",
                api: "API",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.firecrawl.FireCrawlLoader.html"
            }
        ]
    },
    // Table definition for the "pdf_loaders" category (the key referenced by
    // `<CategoryTable category="pdf_loaders" />` in the loaders index).
    // NOTE(review): `link` points at `docs/integrations/loaders`, while the docs
    // directory is named `document_loaders` — confirm this path is intended.
    pdf_loaders: {
        link: 'docs/integrations/loaders',
        // Each column pairs a header `title` with a `formatter` that maps one
        // entry of `items` to what is shown for it.
        columns: [
            // Loader name rendered as an anchor pointing at the item's `link`.
            {title: "Document Loader", formatter: (item) => <a href={
                item.link
            }>{item.name}</a>},
            // The short description is stored under the `source` key.
            {title: "Description", formatter: (item) => item.source},
            // Plain-text "Package" / "API" label taken from `item.api`.
            {title: "Package/API", formatter: (item) => item.api},
        ],
        // NOTE(review): every item carries an `apiLink`, but none of the column
        // formatters above reads it — confirm whether it is consumed elsewhere.
        items: [
            {
                name: "PyPDF",
                link: "pypdfloader",
                source: "Uses `pypdf` to load and parse PDFs",
                api: "Package",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html"
            },
            {
                name: "Unstructured",
                link: "unstructured_file",
                source: "Uses Unstructured's open source library to load PDFs",
                api: "Package",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
            },
            {
                name: "Amazon Textract",
                link: "amazon_textract",
                source: "Uses AWS API to load PDFs",
                api: "API",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
            }
        ]
    },
    // Table definition for the "common_loaders" category (the key referenced
    // by `<CategoryTable category="common_loaders" />` in the loaders index).
    // NOTE(review): `link` points at `docs/integrations/loaders`, while the docs
    // directory is named `document_loaders` — confirm this path is intended.
    common_loaders: {
        link: 'docs/integrations/loaders',
        // Only two columns here: these items define no `api` field, so there is
        // no Package/API column for this category.
        columns: [
            // Loader name rendered as an anchor pointing at the item's `link`.
            {title: "Document Loader", formatter: (item) => <a href={
                item.link
            }>{item.name}</a>},
            // The kind of data handled is stored under the `source` key.
            {title: "Data Type", formatter: (item) => item.source},
        ],
        // NOTE(review): every item carries an `apiLink`, but none of the column
        // formatters above reads it — confirm whether it is consumed elsewhere.
        items: [
            {
                name: "CSVLoader",
                link: "csv",
                source: "CSV files",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.csv_loader.CSVLoader.html"
            },
            {
                name: "DirectoryLoader",
                link: "document_loader_directory",
                source: "All files in a given directory",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html"
            },
            {
                name: "Unstructured",
                link: "unstructured_file",
                source: "All file types",
                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
            },
        ]
    },
    vectorstores: {
        link: 'docs/integrations/vectorstores',
        columns: [
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@ -237,7 +237,7 @@ class PyPDFLoader(BasePDFLoader):
            import pypdf  # noqa:F401
        except ImportError:
            raise ImportError(
-                "pypdf package not found, please install it with " "`pip install pypdf`"
+                "pypdf package not found, please install it with `pip install pypdf`"
            )
        super().__init__(file_path, headers=headers)
        self.parser = PyPDFParser(