docs: improve document loaders index (#25365)

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Harrison Chase 2024-08-13 18:48:48 -07:00 committed by GitHub
parent 4a78be7861
commit 967b6f21f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 151 additions and 10 deletions

View File

@ -7,6 +7,39 @@ sidebar_class_name: hidden
import { CategoryTable, IndexTable } from "@theme/FeatureTables";
DocumentLoaders load data into the standard LangChain Document format.
Each DocumentLoader has its own specific parameters, but they can all be invoked in the same way with the `.load` method.
An example use case is as follows:
```python
from langchain_community.document_loaders.csv_loader import CSVLoader
loader = CSVLoader(
... # <-- Integration specific parameters here
)
data = loader.load()
```
## Common File Types
The document loaders below allow you to load data from common file formats.
<CategoryTable category="common_loaders" />
## PDFs
The document loaders below allow you to load PDF documents.
<CategoryTable category="pdf_loaders" />
## Webpages
The document loaders below allow you to load webpages.
<CategoryTable category="webpage_loaders" />
## All document loaders
<IndexTable />

View File

@ -6,7 +6,7 @@
"source": [
"# PyPDFLoader\n",
"\n",
"This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
"This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all DocumentLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
"\n",
"\n",
"## Overview\n",
@ -43,7 +43,7 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain_community"
"%pip install -qU langchain_community pypdf"
]
},
{
@ -180,7 +180,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -194,9 +194,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.10.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View File

@ -44,7 +44,7 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU langchain_community"
"%pip install -qU langchain_community beautifulsoup4"
]
},
{
@ -330,7 +330,10 @@
"cell_type": "markdown",
"id": "672264ad",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Using proxies\n",
@ -343,7 +346,10 @@
"execution_count": null,
"id": "9caf0310",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"outputs": [],
"source": [
@ -384,7 +390,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.10.1"
}
},
"nbformat": 4,

View File

@ -440,6 +440,108 @@ const FEATURE_TABLES = {
columns: [],
items: [],
},
// Feature table for web page document loaders.
// Each column's `formatter` receives one entry of `items` and returns the cell content.
webpage_loaders: {
// NOTE(review): how `link` is consumed is not visible in this fragment — confirm the
// path points at the document loader integration docs.
link: 'docs/integrations/loaders',
columns: [
// Loader name rendered as an anchor whose href is the item's `link` value.
{title: "Document Loader", formatter: (item) => <a href={
item.link
}>{item.name}</a>},
// The item's `source` field holds the human-readable description.
{title: "Description", formatter: (item) => item.source},
// Shows the plain `api` string ("Package" or "API").
// NOTE(review): `apiLink` is defined on every item but no formatter in this table
// reads it — confirm whether this cell was meant to link to `item.apiLink`.
{title: "Package/API", formatter: (item) => item.api},
],
// One entry per loader; `link` values are relative (no leading slash).
items: [
{
name: "Web",
link: "web_base",
source: "Uses urllib and BeautifulSoup to load and parse HTML web pages",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"
},
{
name: "RecursiveURL",
link: "recursive_url",
source: "Recursively scrapes all child links from a root URL",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html"
},
{
name: "Sitemap",
link: "sitemap",
source: "Scrapes all pages on a given sitemap",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html"
},
{
name: "Firecrawl",
link: "firecrawl",
source: "API service that can be deployed locally, hosted version has free credits.",
api: "API",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.firecrawl.FireCrawlLoader.html"
}
]
},
// Feature table for PDF document loaders.
// Each column's `formatter` receives one entry of `items` and returns the cell content.
pdf_loaders: {
// NOTE(review): how `link` is consumed is not visible in this fragment — confirm the
// path points at the document loader integration docs.
link: 'docs/integrations/loaders',
columns: [
// Loader name rendered as an anchor whose href is the item's `link` value.
{title: "Document Loader", formatter: (item) => <a href={
item.link
}>{item.name}</a>},
// The item's `source` field holds the human-readable description.
{title: "Description", formatter: (item) => item.source},
// Shows the plain `api` string ("Package" or "API").
// NOTE(review): `apiLink` is defined on every item but no formatter in this table
// reads it — confirm whether this cell was meant to link to `item.apiLink`.
{title: "Package/API", formatter: (item) => item.api},
],
// One entry per loader; `link` values are relative (no leading slash).
items: [
{
name: "PyPDF",
link: "pypdfloader",
source: "Uses `pypdf` to load and parse PDFs",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html"
},
{
name: "Unstructured",
link: "unstructured_file",
source: "Uses Unstructured's open source library to load PDFs",
api: "Package",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
},
{
name: "Amazon Textract",
link: "amazon_textract",
source: "Uses AWS API to load PDFs",
api: "API",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
}
]
},
// Feature table for loaders that handle common file types.
// Unlike the webpage/PDF tables above it has only two columns and its items carry no `api` field.
common_loaders: {
// NOTE(review): how `link` is consumed is not visible in this fragment — confirm the
// path points at the document loader integration docs.
link: 'docs/integrations/loaders',
columns: [
// Loader name rendered as an anchor whose href is the item's `link` value.
{title: "Document Loader", formatter: (item) => <a href={
item.link
}>{item.name}</a>},
// The item's `source` field describes the kind of data the loader reads.
{title: "Data Type", formatter: (item) => item.source},
],
// NOTE(review): each item defines `apiLink`, but no column formatter in this table
// reads it — confirm whether an API reference column was intended.
items: [
{
name: "CSVLoader",
link: "csv",
source: "CSV files",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.csv_loader.CSVLoader.html"
},
{
name: "DirectoryLoader",
link: "document_loader_directory",
source: "All files in a given directory",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html"
},
{
name: "Unstructured",
link: "unstructured_file",
source: "All file types",
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
},
]
},
vectorstores: {
link: 'docs/integrations/vectorstores',
columns: [

View File

@ -237,7 +237,7 @@ class PyPDFLoader(BasePDFLoader):
import pypdf # noqa:F401
except ImportError:
raise ImportError(
"pypdf package not found, please install it with " "`pip install pypdf`"
"pypdf package not found, please install it with `pip install pypdf`"
)
super().__init__(file_path, headers=headers)
self.parser = PyPDFParser(