mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-16 15:04:13 +00:00
[docs]: standardize doc loader doc strings (#25325)
This commit is contained in:
@@ -6,11 +6,63 @@ from langchain_core.utils import get_from_env
|
||||
|
||||
|
||||
class FireCrawlLoader(BaseLoader):
|
||||
"""Load web pages as Documents using FireCrawl.
|
||||
|
||||
Must have Python package `firecrawl` installed and a FireCrawl API key. See
|
||||
https://www.firecrawl.dev/ for more.
|
||||
"""
|
||||
FireCrawlLoader document loader integration
|
||||
|
||||
Setup:
|
||||
Install ``firecrawl-py``, ``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U firecrawl-py langchain_community
|
||||
export FIRECRAWL_API_KEY="your-api-key"
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import FireCrawlLoader
|
||||
|
||||
loader = FireCrawlLoader(
|
||||
url = "https://firecrawl.dev",
|
||||
mode = "crawl",
|
||||
# other params = ...
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
|
||||
Join the waitlist to turn any web
|
||||
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
|
||||
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
|
||||
Join the waitlist to turn any web
|
||||
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@@ -161,10 +161,66 @@ class OnlinePDFLoader(BasePDFLoader):
|
||||
|
||||
|
||||
class PyPDFLoader(BasePDFLoader):
|
||||
"""Load PDF using pypdf into list of documents.
|
||||
|
||||
Loader chunks by page and stores page numbers in metadata.
|
||||
"""
|
||||
PyPDFLoader document loader integration
|
||||
|
||||
Setup:
|
||||
Install ``langchain-community``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
|
||||
loader = PyPDFLoader(
|
||||
file_path = "./example_data/layout-parser-paper.pdf",
|
||||
password = "my-password",
|
||||
extract_images = True,
|
||||
# headers = None,
|
||||
# extraction_mode = "plain",
|
||||
# extraction_kwargs = None,
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
LayoutParser : A Unified Toolkit for Deep
|
||||
Learning Based Document Image Analysis
|
||||
Zejiang Shen1( ), R
|
||||
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
|
||||
|
||||
# TODO: Delete if async load is not implemented
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
LayoutParser : A Unified Toolkit for Deep
|
||||
Learning Based Document Image Analysis
|
||||
Zejiang Shen1( ), R
|
||||
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@@ -39,7 +39,75 @@ def _build_metadata(soup: Any, url: str) -> dict:
|
||||
|
||||
|
||||
class WebBaseLoader(BaseLoader):
|
||||
"""Load HTML pages using `urllib` and parse them with `BeautifulSoup'."""
|
||||
"""
|
||||
WebBaseLoader document loader integration
|
||||
|
||||
Setup:
|
||||
Install ``langchain_community``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain_community
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import WebBaseLoader
|
||||
|
||||
loader = WebBaseLoader(
|
||||
web_path = "https://www.espn.com/",
|
||||
# header_template = None,
|
||||
# verify_ssl = True,
|
||||
# proxies = None,
|
||||
# continue_on_failure = False,
|
||||
# autoset_encoding = True,
|
||||
# encoding = None,
|
||||
# web_paths = (),
|
||||
# requests_per_second = 2,
|
||||
# default_parser = "html.parser",
|
||||
# requests_kwargs = None,
|
||||
# raise_for_status = False,
|
||||
# bs_get_text_kwargs = None,
|
||||
# bs_kwargs = None,
|
||||
# session = None,
|
||||
# show_progress = True,
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
ESPN - Serving Sports Fans. Anytime. Anywhere.
|
||||
|
||||
{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
|
||||
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
ESPN - Serving Sports Fans. Anytime. Anywhere.
|
||||
|
||||
{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
Reference in New Issue
Block a user