diff --git a/docs/docs/integrations/document_loaders/index.mdx b/docs/docs/integrations/document_loaders/index.mdx
index 32bb8e4f74e..ea81e733f6a 100644
--- a/docs/docs/integrations/document_loaders/index.mdx
+++ b/docs/docs/integrations/document_loaders/index.mdx
@@ -7,6 +7,39 @@ sidebar_class_name: hidden
import { CategoryTable, IndexTable } from "@theme/FeatureTables";
+DocumentLoaders load data into the standard LangChain Document format.
+
+Each DocumentLoader has its own specific parameters, but they can all be invoked in the same way with the `.load` method.
+An example use case is as follows:
+
+```python
+from langchain_community.document_loaders.csv_loader import CSVLoader
+
+loader = CSVLoader(
+ ... # <-- Integration specific parameters here
+)
+data = loader.load()
+```
+
+## Common File Types
+
+The below document loaders allow you to load data from common data formats.
+
+<CategoryTable category="common_loaders" />
+
+## PDFs
+
+The below document loaders allow you to load PDF documents.
+
+<CategoryTable category="pdf_loaders" />
+
+## Webpages
+
+The below document loaders allow you to load webpages.
+
+<CategoryTable category="webpage_loaders" />
+
+
## All document loaders
diff --git a/docs/docs/integrations/document_loaders/pypdfloader.ipynb b/docs/docs/integrations/document_loaders/pypdfloader.ipynb
index 9debf5cf183..27a1675fe3a 100644
--- a/docs/docs/integrations/document_loaders/pypdfloader.ipynb
+++ b/docs/docs/integrations/document_loaders/pypdfloader.ipynb
@@ -6,7 +6,7 @@
"source": [
"# PyPDFLoader\n",
"\n",
- "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
+ "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all `PyPDFLoader` features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
"\n",
"\n",
"## Overview\n",
@@ -43,7 +43,7 @@
"metadata": {},
"outputs": [],
"source": [
- "%pip install -qU langchain_community"
+ "%pip install -qU langchain_community pypdf"
]
},
{
@@ -180,7 +180,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -194,9 +194,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.9"
+ "version": "3.10.1"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/docs/docs/integrations/document_loaders/web_base.ipynb b/docs/docs/integrations/document_loaders/web_base.ipynb
index e41c40244e1..b6eabc17b1c 100644
--- a/docs/docs/integrations/document_loaders/web_base.ipynb
+++ b/docs/docs/integrations/document_loaders/web_base.ipynb
@@ -44,7 +44,7 @@
"metadata": {},
"outputs": [],
"source": [
- "%pip install -qU langchain_community"
+ "%pip install -qU langchain_community beautifulsoup4"
]
},
{
@@ -330,7 +330,10 @@
"cell_type": "markdown",
"id": "672264ad",
"metadata": {
- "collapsed": false
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
},
"source": [
"## Using proxies\n",
@@ -343,7 +346,10 @@
"execution_count": null,
"id": "9caf0310",
"metadata": {
- "collapsed": false
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ }
},
"outputs": [],
"source": [
@@ -384,7 +390,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.9"
+ "version": "3.10.1"
}
},
"nbformat": 4,
diff --git a/docs/src/theme/FeatureTables.js b/docs/src/theme/FeatureTables.js
index f03854158cf..e390f35a877 100644
--- a/docs/src/theme/FeatureTables.js
+++ b/docs/src/theme/FeatureTables.js
@@ -440,6 +440,108 @@ const FEATURE_TABLES = {
columns: [],
items: [],
},
+ webpage_loaders: {
+ link: 'docs/integrations/loaders',
+ columns: [
+ {title: "Document Loader", formatter: (item) => {item.name}},
+ {title: "Description", formatter: (item) => item.source},
+ {title: "Package/API", formatter: (item) => item.api},
+ ],
+ items: [
+ {
+ name: "Web",
+ link: "web_base",
+ source: "Uses urllib and BeautifulSoup to load and parse HTML web pages",
+ api: "Package",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"
+ },
+ {
+ name: "RecursiveURL",
+ link: "recursive_url",
+ source: "Recursively scrapes all child links from a root URL",
+ api: "Package",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html"
+ },
+ {
+ name: "Sitemap",
+ link: "sitemap",
+ source: "Scrapes all pages on a given sitemap",
+ api: "Package",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html"
+ },
+ {
+ name: "Firecrawl",
+ link: "firecrawl",
+ source: "API service that can be deployed locally, hosted version has free credits.",
+ api: "API",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.firecrawl.FireCrawlLoader.html"
+ }
+ ]
+ },
+ pdf_loaders: {
+ link: 'docs/integrations/loaders',
+ columns: [
+ {title: "Document Loader", formatter: (item) => {item.name}},
+ {title: "Description", formatter: (item) => item.source},
+ {title: "Package/API", formatter: (item) => item.api},
+ ],
+ items: [
+ {
+ name: "PyPDF",
+ link: "pypdfloader",
+ source: "Uses `pypdf` to load and parse PDFs",
+ api: "Package",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html"
+ },
+ {
+ name: "Unstructured",
+ link: "unstructured_file",
+ source: "Uses Unstructured's open source library to load PDFs",
+ api: "Package",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
+ },
+ {
+ name: "Amazon Textract",
+ link: "amazon_textract",
+ source: "Uses AWS API to load PDFs",
+ api: "API",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
+ }
+ ]
+ },
+ common_loaders: {
+ link: 'docs/integrations/loaders',
+ columns: [
+ {title: "Document Loader", formatter: (item) => {item.name}},
+ {title: "Data Type", formatter: (item) => item.source},
+ ],
+ items: [
+ {
+ name: "CSVLoader",
+ link: "csv",
+ source: "CSV files",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.csv_loader.CSVLoader.html"
+ },
+ {
+ name: "DirectoryLoader",
+ link: "document_loader_directory",
+ source: "All files in a given directory",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html"
+ },
+ {
+ name: "Unstructured",
+ link: "unstructured_file",
+ source: "All file types",
+ apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
+ },
+ ]
+ },
vectorstores: {
link: 'docs/integrations/vectorstores',
columns: [
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index b7fbf575045..6e9c70ca1af 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -237,7 +237,7 @@ class PyPDFLoader(BasePDFLoader):
import pypdf # noqa:F401
except ImportError:
raise ImportError(
- "pypdf package not found, please install it with " "`pip install pypdf`"
+ "pypdf package not found, please install it with `pip install pypdf`"
)
super().__init__(file_path, headers=headers)
self.parser = PyPDFParser(