mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 13:23:35 +00:00
docs: improve document loaders index (#25365)
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
4a78be7861
commit
967b6f21f6
@ -7,6 +7,39 @@ sidebar_class_name: hidden
|
|||||||
|
|
||||||
import { CategoryTable, IndexTable } from "@theme/FeatureTables";
|
import { CategoryTable, IndexTable } from "@theme/FeatureTables";
|
||||||
|
|
||||||
|
DocumentLoaders load data into the standard LangChain Document format.
|
||||||
|
|
||||||
|
Each DocumentLoader has its own specific parameters, but they can all be invoked in the same way with the `.load` method.
|
||||||
|
An example use case is as follows:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain_community.document_loaders.csv_loader import CSVLoader
|
||||||
|
|
||||||
|
loader = CSVLoader(
|
||||||
|
... # <-- Integration specific parameters here
|
||||||
|
)
|
||||||
|
data = loader.load()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common File Types
|
||||||
|
|
||||||
|
The below document loaders allow you to load data from common data formats.
|
||||||
|
|
||||||
|
<CategoryTable category="common_loaders" />
|
||||||
|
|
||||||
|
## PDFs
|
||||||
|
|
||||||
|
The below document loaders allow you to load PDF documents.
|
||||||
|
|
||||||
|
<CategoryTable category="pdf_loaders" />
|
||||||
|
|
||||||
|
## Webpages
|
||||||
|
|
||||||
|
The below document loaders allow you to load webpages.
|
||||||
|
|
||||||
|
<CategoryTable category="webpage_loaders" />
|
||||||
|
|
||||||
|
|
||||||
## All document loaders
|
## All document loaders
|
||||||
|
|
||||||
<IndexTable />
|
<IndexTable />
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# PyPDFLoader\n",
|
"# PyPDFLoader\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
|
"This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all DocumentLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"## Overview\n",
|
"## Overview\n",
|
||||||
@ -43,7 +43,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"%pip install -qU langchain_community"
|
"%pip install -qU langchain_community pypdf"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -180,7 +180,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -194,9 +194,9 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.11.9"
|
"version": "3.10.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 2
|
"nbformat_minor": 4
|
||||||
}
|
}
|
||||||
|
@ -44,7 +44,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"%pip install -qU langchain_community"
|
"%pip install -qU langchain_community beautifulsoup4"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -330,7 +330,10 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "672264ad",
|
"id": "672264ad",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"## Using proxies\n",
|
"## Using proxies\n",
|
||||||
@ -343,7 +346,10 @@
|
|||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "9caf0310",
|
"id": "9caf0310",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -384,7 +390,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.11.9"
|
"version": "3.10.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -440,6 +440,108 @@ const FEATURE_TABLES = {
|
|||||||
columns: [],
|
columns: [],
|
||||||
items: [],
|
items: [],
|
||||||
},
|
},
|
||||||
|
webpage_loaders: {
|
||||||
|
link: 'docs/integrations/loaders',
|
||||||
|
columns: [
|
||||||
|
{title: "Document Loader", formatter: (item) => <a href={
|
||||||
|
item.link
|
||||||
|
}>{item.name}</a>},
|
||||||
|
{title: "Description", formatter: (item) => item.source},
|
||||||
|
{title: "Package/API", formatter: (item) => item.api},
|
||||||
|
],
|
||||||
|
items: [
|
||||||
|
{
|
||||||
|
name: "Web",
|
||||||
|
link: "web_base",
|
||||||
|
source: "Uses urllib and BeautifulSoup to load and parse HTML web pages",
|
||||||
|
api: "Package",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "RecursiveURL",
|
||||||
|
link: "recursive_url",
|
||||||
|
source: "Recursively scrapes all child links from a root URL",
|
||||||
|
api: "Package",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Sitemap",
|
||||||
|
link: "sitemap",
|
||||||
|
source: "Scrapes all pages on a given sitemap",
|
||||||
|
api: "Package",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Firecrawl",
|
||||||
|
link: "firecrawl",
|
||||||
|
source: "API service that can be deployed locally, hosted version has free credits.",
|
||||||
|
api: "API",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.firecrawl.FireCrawlLoader.html"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
pdf_loaders: {
|
||||||
|
link: 'docs/integrations/loaders',
|
||||||
|
columns: [
|
||||||
|
{title: "Document Loader", formatter: (item) => <a href={
|
||||||
|
item.link
|
||||||
|
}>{item.name}</a>},
|
||||||
|
{title: "Description", formatter: (item) => item.source},
|
||||||
|
{title: "Package/API", formatter: (item) => item.api},
|
||||||
|
],
|
||||||
|
items: [
|
||||||
|
{
|
||||||
|
name: "PyPDF",
|
||||||
|
link: "pypdfloader",
|
||||||
|
source: "Uses `pypdf` to load and parse PDFs",
|
||||||
|
api: "Package",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Unstructured",
|
||||||
|
link: "unstructured_file",
|
||||||
|
source: "Uses Unstructured's open source library to load PDFs",
|
||||||
|
api: "Package",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Amazon Textract",
|
||||||
|
link: "amazon_textract",
|
||||||
|
source: "Uses AWS API to load PDFs",
|
||||||
|
api: "API",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
common_loaders: {
|
||||||
|
link: 'docs/integrations/loaders',
|
||||||
|
columns: [
|
||||||
|
{title: "Document Loader", formatter: (item) => <a href={
|
||||||
|
item.link
|
||||||
|
}>{item.name}</a>},
|
||||||
|
{title: "Data Type", formatter: (item) => item.source},
|
||||||
|
],
|
||||||
|
items: [
|
||||||
|
{
|
||||||
|
name: "CSVLoader",
|
||||||
|
link: "csv",
|
||||||
|
source: "CSV files",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.csv_loader.CSVLoader.html"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "DirectoryLoader",
|
||||||
|
link: "document_loader_directory",
|
||||||
|
source: "All files in a given directory",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "Unstructured",
|
||||||
|
link: "unstructured_file",
|
||||||
|
source: "All file types",
|
||||||
|
apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
|
||||||
|
},
|
||||||
|
]
|
||||||
|
},
|
||||||
vectorstores: {
|
vectorstores: {
|
||||||
link: 'docs/integrations/vectorstores',
|
link: 'docs/integrations/vectorstores',
|
||||||
columns: [
|
columns: [
|
||||||
|
@ -237,7 +237,7 @@ class PyPDFLoader(BasePDFLoader):
|
|||||||
import pypdf # noqa:F401
|
import pypdf # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"pypdf package not found, please install it with " "`pip install pypdf`"
|
"pypdf package not found, please install it with `pip install pypdf`"
|
||||||
)
|
)
|
||||||
super().__init__(file_path, headers=headers)
|
super().__init__(file_path, headers=headers)
|
||||||
self.parser = PyPDFParser(
|
self.parser = PyPDFParser(
|
||||||
|
Loading…
Reference in New Issue
Block a user