mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-18 21:09:00 +00:00
docs: improve document loaders index (#25365)
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
4a78be7861
commit
967b6f21f6
@ -7,6 +7,39 @@ sidebar_class_name: hidden
|
||||
|
||||
import { CategoryTable, IndexTable } from "@theme/FeatureTables";
|
||||
|
||||
DocumentLoaders load data into the standard LangChain Document format.
|
||||
|
||||
Each DocumentLoader has its own specific parameters, but they can all be invoked in the same way with the `.load` method.
|
||||
An example use case is as follows:
|
||||
|
||||
```python
|
||||
from langchain_community.document_loaders.csv_loader import CSVLoader
|
||||
|
||||
loader = CSVLoader(
|
||||
... # <-- Integration specific parameters here
|
||||
)
|
||||
data = loader.load()
|
||||
```
|
||||
|
||||
## Common File Types
|
||||
|
||||
The below document loaders allow you to load data from common data formats.
|
||||
|
||||
<CategoryTable category="common_loaders" />
|
||||
|
||||
## PDFs
|
||||
|
||||
The below document loaders allow you to load PDF documents.
|
||||
|
||||
<CategoryTable category="pdf_loaders" />
|
||||
|
||||
## Webpages
|
||||
|
||||
The below document loaders allow you to load webpages.
|
||||
|
||||
<CategoryTable category="webpage_loaders" />
|
||||
|
||||
|
||||
## All document loaders
|
||||
|
||||
<IndexTable />
|
||||
|
@ -6,7 +6,7 @@
|
||||
"source": [
|
||||
"# PyPDFLoader\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
|
||||
"This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all DocumentLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
@ -43,7 +43,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
"%pip install -qU langchain_community pypdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -180,7 +180,7 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -194,9 +194,9 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
"version": "3.10.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
|
@ -44,7 +44,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
"%pip install -qU langchain_community beautifulsoup4"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -330,7 +330,10 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "672264ad",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Using proxies\n",
|
||||
@ -343,7 +346,10 @@
|
||||
"execution_count": null,
|
||||
"id": "9caf0310",
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"jupyter": {
|
||||
"outputs_hidden": false
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -384,7 +390,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
"version": "3.10.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -440,6 +440,108 @@ const FEATURE_TABLES = {
|
||||
columns: [],
|
||||
items: [],
|
||||
},
|
||||
webpage_loaders: {
|
||||
link: 'docs/integrations/loaders',
|
||||
columns: [
|
||||
{title: "Document Loader", formatter: (item) => <a href={
|
||||
item.link
|
||||
}>{item.name}</a>},
|
||||
{title: "Description", formatter: (item) => item.source},
|
||||
{title: "Package/API", formatter: (item) => item.api},
|
||||
],
|
||||
items: [
|
||||
{
|
||||
name: "Web",
|
||||
link: "web_base",
|
||||
source: "Uses urllib and BeautifulSoup to load and parse HTML web pages",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"
|
||||
},
|
||||
{
|
||||
name: "RecursiveURL",
|
||||
link: "recursive_url",
|
||||
source: "Recursively scrapes all child links from a root URL",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html"
|
||||
},
|
||||
{
|
||||
name: "Sitemap",
|
||||
link: "sitemap",
|
||||
source: "Scrapes all pages on a given sitemap",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html"
|
||||
},
|
||||
{
|
||||
name: "Firecrawl",
|
||||
link: "firecrawl",
|
||||
source: "API service that can be deployed locally, hosted version has free credits.",
|
||||
api: "API",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.firecrawl.FireCrawlLoader.html"
|
||||
}
|
||||
]
|
||||
},
|
||||
pdf_loaders: {
|
||||
link: 'docs/integrations/loaders',
|
||||
columns: [
|
||||
{title: "Document Loader", formatter: (item) => <a href={
|
||||
item.link
|
||||
}>{item.name}</a>},
|
||||
{title: "Description", formatter: (item) => item.source},
|
||||
{title: "Package/API", formatter: (item) => item.api},
|
||||
],
|
||||
items: [
|
||||
{
|
||||
name: "PyPDF",
|
||||
link: "pypdfloader",
|
||||
source: "Uses `pypdf` to load and parse PDFs",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html"
|
||||
},
|
||||
{
|
||||
name: "Unstructured",
|
||||
link: "unstructured_file",
|
||||
source: "Uses Unstructured's open source library to load PDFs",
|
||||
api: "Package",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
|
||||
},
|
||||
{
|
||||
name: "Amazon Textract",
|
||||
link: "amazon_textract",
|
||||
source: "Uses AWS API to load PDFs",
|
||||
api: "API",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
|
||||
}
|
||||
]
|
||||
},
|
||||
common_loaders: {
|
||||
link: 'docs/integrations/loaders',
|
||||
columns: [
|
||||
{title: "Document Loader", formatter: (item) => <a href={
|
||||
item.link
|
||||
}>{item.name}</a>},
|
||||
{title: "Data Type", formatter: (item) => item.source},
|
||||
],
|
||||
items: [
|
||||
{
|
||||
name: "CSVLoader",
|
||||
link: "csv",
|
||||
source: "CSV files",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.csv_loader.CSVLoader.html"
|
||||
},
|
||||
{
|
||||
name: "DirectoryLoader",
|
||||
link: "document_loader_directory",
|
||||
source: "All files in a given directory",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html"
|
||||
},
|
||||
{
|
||||
name: "Unstructured",
|
||||
link: "unstructured_file",
|
||||
source: "All file types",
|
||||
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
|
||||
},
|
||||
]
|
||||
},
|
||||
vectorstores: {
|
||||
link: 'docs/integrations/vectorstores',
|
||||
columns: [
|
||||
|
@ -237,7 +237,7 @@ class PyPDFLoader(BasePDFLoader):
|
||||
import pypdf # noqa:F401
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"pypdf package not found, please install it with " "`pip install pypdf`"
|
||||
"pypdf package not found, please install it with `pip install pypdf`"
|
||||
)
|
||||
super().__init__(file_path, headers=headers)
|
||||
self.parser = PyPDFParser(
|
||||
|
Loading…
Reference in New Issue
Block a user