From 967b6f21f6462f16f0ba4dc38ed914ec4ca550ce Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Tue, 13 Aug 2024 18:48:48 -0700 Subject: [PATCH] docs: improve document loaders index (#25365) Co-authored-by: Erick Friis --- .../integrations/document_loaders/index.mdx | 33 ++++++ .../document_loaders/pypdfloader.ipynb | 10 +- .../document_loaders/web_base.ipynb | 14 ++- docs/src/theme/FeatureTables.js | 102 ++++++++++++++++++ .../document_loaders/pdf.py | 2 +- 5 files changed, 151 insertions(+), 10 deletions(-) diff --git a/docs/docs/integrations/document_loaders/index.mdx b/docs/docs/integrations/document_loaders/index.mdx index 32bb8e4f74e..ea81e733f6a 100644 --- a/docs/docs/integrations/document_loaders/index.mdx +++ b/docs/docs/integrations/document_loaders/index.mdx @@ -7,6 +7,39 @@ sidebar_class_name: hidden import { CategoryTable, IndexTable } from "@theme/FeatureTables"; +DocumentLoaders load data into the standard LangChain Document format. + +Each DocumentLoader has its own specific parameters, but they can all be invoked in the same way with the .load method. +An example use case is as follows: + +```python +from langchain_community.document_loaders.csv_loader import CSVLoader + +loader = CSVLoader( + ... # <-- Integration specific parameters here +) +data = loader.load() +``` + +## Common File Types + +The below document loaders allow you to load data from common data formats. + + + +## PDFs + +The below document loaders allow you to load PDF documents. + + + +## Webpages + +The below document loaders allow you to load webpages. 
+ + + + ## All document loaders diff --git a/docs/docs/integrations/document_loaders/pypdfloader.ipynb b/docs/docs/integrations/document_loaders/pypdfloader.ipynb index 9debf5cf183..27a1675fe3a 100644 --- a/docs/docs/integrations/document_loaders/pypdfloader.ipynb +++ b/docs/docs/integrations/document_loaders/pypdfloader.ipynb @@ -6,7 +6,7 @@ "source": [ "# PyPDFLoader\n", "\n", - "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n", + "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all PyPDFLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n", "\n", "\n", "## Overview\n", @@ -43,7 +43,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -qU langchain_community" + "%pip install -qU langchain_community pypdf" ] }, { @@ -180,7 +180,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -194,9 +194,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.1" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/docs/integrations/document_loaders/web_base.ipynb b/docs/docs/integrations/document_loaders/web_base.ipynb index e41c40244e1..b6eabc17b1c 100644 --- a/docs/docs/integrations/document_loaders/web_base.ipynb +++ 
b/docs/docs/integrations/document_loaders/web_base.ipynb @@ -44,7 +44,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -qU langchain_community" + "%pip install -qU langchain_community beautifulsoup4" ] }, { @@ -330,7 +330,10 @@ "cell_type": "markdown", "id": "672264ad", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "source": [ "## Using proxies\n", @@ -343,7 +346,10 @@ "execution_count": null, "id": "9caf0310", "metadata": { - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, "outputs": [], "source": [ @@ -384,7 +390,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.1" } }, "nbformat": 4, diff --git a/docs/src/theme/FeatureTables.js b/docs/src/theme/FeatureTables.js index f03854158cf..e390f35a877 100644 --- a/docs/src/theme/FeatureTables.js +++ b/docs/src/theme/FeatureTables.js @@ -440,6 +440,108 @@ const FEATURE_TABLES = { columns: [], items: [], }, + webpage_loaders: { + link: 'docs/integrations/loaders', + columns: [ + {title: "Document Loader", formatter: (item) => {item.name}}, + {title: "Description", formatter: (item) => item.source}, + {title: "Package/API", formatter: (item) => item.api}, + ], + items: [ + { + name: "Web", + link: "web_base", + source: "Uses urllib and BeautifulSoup to load and parse HTML web pages", + api: "Package", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html" + }, + { + name: "RecursiveURL", + link: "recursive_url", + source: "Recursively scrapes all child links from a root URL", + api: "Package", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html" + }, + { + name: "Sitemap", + link: "sitemap", + source: "Scrapes all pages on a given sitemap", + api: 
"Package", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html" + }, + { + name: "Firecrawl", + link: "firecrawl", + source: "API service that can be deployed locally, hosted version has free credits.", + api: "API", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.firecrawl.FireCrawlLoader.html" + } + ] + }, + pdf_loaders: { + link: 'docs/integrations/loaders', + columns: [ + {title: "Document Loader", formatter: (item) => {item.name}}, + {title: "Description", formatter: (item) => item.source}, + {title: "Package/API", formatter: (item) => item.api}, + ], + items: [ + { + name: "PyPDF", + link: "pypdfloader", + source: "Uses `pypdf` to load and parse PDFs", + api: "Package", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html" + }, + { + name: "Unstructured", + link: "unstructured_file", + source: "Uses Unstructured's open source library to load PDFs", + api: "Package", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html" + }, + { + name: "Amazon Textract", + link: "amazon_textract", + source: "Uses AWS API to load PDFs", + api: "API", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html" + } + ] + }, + common_loaders: { + link: 'docs/integrations/loaders', + columns: [ + {title: "Document Loader", formatter: (item) => {item.name}}, + {title: "Data Type", formatter: (item) => item.source}, + ], + items: [ + { + name: "CSVLoader", + link: "csv", + source: "CSV files", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.csv_loader.CSVLoader.html" + }, + { + name: "DirectoryLoader", + link: "document_loader_directory", + 
source: "All files in a given directory", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html" + }, + { + name: "Unstructured", + link: "unstructured_file", + source: "All file types", + apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html" + }, + ] + }, vectorstores: { link: 'docs/integrations/vectorstores', columns: [ diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index b7fbf575045..6e9c70ca1af 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -237,7 +237,7 @@ class PyPDFLoader(BasePDFLoader): import pypdf # noqa:F401 except ImportError: raise ImportError( - "pypdf package not found, please install it with " "`pip install pypdf`" + "pypdf package not found, please install it with `pip install pypdf`" ) super().__init__(file_path, headers=headers) self.parser = PyPDFParser(