[docs]: standardize doc loader doc strings (#25325)

This commit is contained in:
Isaac Francisco 2024-08-13 16:18:56 -07:00 committed by GitHub
parent e0bbb81d04
commit f4ffd692a3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 345 additions and 70 deletions

File diff suppressed because one or more lines are too long

View File

@ -122,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -131,21 +131,41 @@
"6"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"page = []\n",
"pages = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" pages.append(doc)\n",
" if len(pages) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []\n",
"len(page)"
" pages = []\n",
"len(pages)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LayoutParser : A Unified Toolkit for DL-Based DIA 11\n",
"focuses on precision, efficiency, and robustness. \n",
"{'source': './example_data/layout-parser-paper.pdf', 'page': 10}\n"
]
}
],
"source": [
"print(pages[0].page_content[:100])\n",
"print(pages[0].metadata)"
]
},
{

View File

@ -229,14 +229,14 @@
}
],
"source": [
"page = []\n",
"pages = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" pages.append(doc)\n",
" if len(pages) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []"
" pages = []"
]
},
{

File diff suppressed because one or more lines are too long

View File

@ -6,11 +6,63 @@ from langchain_core.utils import get_from_env
class FireCrawlLoader(BaseLoader):
"""Load web pages as Documents using FireCrawl.
Must have Python package `firecrawl` installed and a FireCrawl API key. See
https://www.firecrawl.dev/ for more.
"""
FireCrawlLoader document loader integration
Setup:
Install ``firecrawl-py``,``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``.
.. code-block:: bash
pip install -U firecrawl-py langchain_community
export FIRECRAWL_API_KEY="your-api-key"
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import FireCrawlLoader
loader = FireCrawlLoader(
url = "https://firecrawl.dev",
mode = "crawl"
# other params = ...
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
Join the waitlist to turn any web
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
Join the waitlist to turn any web
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
""" # noqa: E501
def __init__(
self,

View File

@ -161,10 +161,66 @@ class OnlinePDFLoader(BasePDFLoader):
class PyPDFLoader(BasePDFLoader):
"""Load PDF using pypdf into list of documents.
Loader chunks by page and stores page numbers in metadata.
"""
PyPDFLoader document loader integration
Setup:
Install ``langchain-community``.
.. code-block:: bash
pip install -U langchain-community
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader(
file_path = "./example_data/layout-parser-paper.pdf",
password = "my-pasword",
extract_images = True,
# headers = None
# extraction_mode = "plain",
# extraction_kwargs = None,
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
LayoutParser : A Unified Toolkit for Deep
Learning Based Document Image Analysis
Zejiang Shen1( ), R
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
# TODO: Delete if async load is not implemented
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
LayoutParser : A Unified Toolkit for Deep
Learning Based Document Image Analysis
Zejiang Shen1( ), R
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
""" # noqa: E501
def __init__(
self,

View File

@ -39,7 +39,75 @@ def _build_metadata(soup: Any, url: str) -> dict:
class WebBaseLoader(BaseLoader):
"""Load HTML pages using `urllib` and parse them with `BeautifulSoup'."""
"""
WebBaseLoader document loader integration
Setup:
Install ``langchain_community``.
.. code-block:: bash
pip install -U langchain_community
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(
web_path = "https://www.espn.com/"
# header_template = None,
# verify_ssl = True,
# proxies = None,
# continue_on_failure = False,
# autoset_encoding = True,
# encoding = None,
# web_paths = (),
# requests_per_second = 2,
# default_parser = "html.parser",
# requests_kwargs = None,
# raise_for_status = False,
# bs_get_text_kwargs = None,
# bs_kwargs = None,
# session = None,
# show_progress = True,
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
ESPN - Serving Sports Fans. Anytime. Anywhere.
{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
ESPN - Serving Sports Fans. Anytime. Anywhere.
{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
""" # noqa: E501
def __init__(
self,

View File

@ -24,29 +24,9 @@ _DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general"
class UnstructuredLoader(BaseLoader):
"""Unstructured document loader interface.
Partition and load files using either the `unstructured-client` sdk and the
Unstructured API or locally using the `unstructured` library.
API:
This package is configured to work with the Unstructured API by default.
To use the Unstructured API, set
`partition_via_api=True` and define `api_key`. If you are running the unstructured
API locally, you can change the API rule by defining `url` when you initialize the
loader. The hosted Unstructured API requires an API key. See the links below to
learn more about our API offerings and get an API key.
Local:
To partition files locally, you must have the `unstructured` package installed.
You can install it with `pip install unstructured`.
By default the file loader uses the Unstructured `partition` function and will
automatically detect the file type.
In addition to document specific partition parameters, Unstructured has a rich set
of "chunking" parameters for post-processing elements into more useful text segments
for uses cases such as Retrieval Augmented Generation (RAG). You can pass additional
Unstructured kwargs to the loader to configure different unstructured settings.
Setup:
Install ``langchain-unstructured`` and set environment variable ``UNSTRUCTURED_API_KEY``.
.. code-block:: bash
pip install -U langchain-unstructured
export UNSTRUCTURED_API_KEY="your-api-key"
@ -63,20 +43,46 @@ class UnstructuredLoader(BaseLoader):
strategy="fast",
)
Load:
Lazy load:
.. code-block:: python
docs = loader.load()
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
1 2 0 2
{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
1 2 0 2
{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}
References
----------
https://docs.unstructured.io/api-reference/api-services/sdk
https://docs.unstructured.io/api-reference/api-services/overview
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
"""
""" # noqa: E501
def __init__(
self,