mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-24 15:43:54 +00:00
[docs]: standardize doc loader doc strings (#25325)
This commit is contained in:
parent
e0bbb81d04
commit
f4ffd692a3
File diff suppressed because one or more lines are too long
@ -122,7 +122,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -131,21 +131,41 @@
|
||||
"6"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"pages = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" pages.append(doc)\n",
|
||||
" if len(pages) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []\n",
|
||||
"len(page)"
|
||||
" pages = []\n",
|
||||
"len(pages)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"LayoutParser : A Unified Toolkit for DL-Based DIA 11\n",
|
||||
"focuses on precision, efficiency, and robustness. \n",
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'page': 10}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(pages[0].page_content[:100])\n",
|
||||
"print(pages[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -229,14 +229,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"pages = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" pages.append(doc)\n",
|
||||
" if len(pages) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
" pages = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
File diff suppressed because one or more lines are too long
@ -6,11 +6,63 @@ from langchain_core.utils import get_from_env
|
||||
|
||||
|
||||
class FireCrawlLoader(BaseLoader):
|
||||
"""Load web pages as Documents using FireCrawl.
|
||||
|
||||
Must have Python package `firecrawl` installed and a FireCrawl API key. See
|
||||
https://www.firecrawl.dev/ for more.
|
||||
"""
|
||||
FireCrawlLoader document loader integration
|
||||
|
||||
Setup:
|
||||
Install ``firecrawl-py``,``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U firecrawl-py langchain_community
|
||||
export FIRECRAWL_API_KEY="your-api-key"
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import FireCrawlLoader
|
||||
|
||||
loader = FireCrawlLoader(
|
||||
url = "https://firecrawl.dev",
|
||||
mode = "crawl"
|
||||
# other params = ...
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
|
||||
Join the waitlist to turn any web
|
||||
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
|
||||
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
|
||||
Join the waitlist to turn any web
|
||||
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -161,10 +161,66 @@ class OnlinePDFLoader(BasePDFLoader):
|
||||
|
||||
|
||||
class PyPDFLoader(BasePDFLoader):
|
||||
"""Load PDF using pypdf into list of documents.
|
||||
|
||||
Loader chunks by page and stores page numbers in metadata.
|
||||
"""
|
||||
PyPDFLoader document loader integration
|
||||
|
||||
Setup:
|
||||
Install ``langchain-community``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
|
||||
loader = PyPDFLoader(
|
||||
file_path = "./example_data/layout-parser-paper.pdf",
|
||||
password = "my-pasword",
|
||||
extract_images = True,
|
||||
# headers = None
|
||||
# extraction_mode = "plain",
|
||||
# extraction_kwargs = None,
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
LayoutParser : A Unified Toolkit for Deep
|
||||
Learning Based Document Image Analysis
|
||||
Zejiang Shen1( ), R
|
||||
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
|
||||
|
||||
# TODO: Delete if async load is not implemented
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
LayoutParser : A Unified Toolkit for Deep
|
||||
Learning Based Document Image Analysis
|
||||
Zejiang Shen1( ), R
|
||||
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -39,7 +39,75 @@ def _build_metadata(soup: Any, url: str) -> dict:
|
||||
|
||||
|
||||
class WebBaseLoader(BaseLoader):
|
||||
"""Load HTML pages using `urllib` and parse them with `BeautifulSoup'."""
|
||||
"""
|
||||
WebBaseLoader document loader integration
|
||||
|
||||
Setup:
|
||||
Install ``langchain_community``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain_community
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import WebBaseLoader
|
||||
|
||||
loader = WebBaseLoader(
|
||||
web_path = "https://www.espn.com/"
|
||||
# header_template = None,
|
||||
# verify_ssl = True,
|
||||
# proxies = None,
|
||||
# continue_on_failure = False,
|
||||
# autoset_encoding = True,
|
||||
# encoding = None,
|
||||
# web_paths = (),
|
||||
# requests_per_second = 2,
|
||||
# default_parser = "html.parser",
|
||||
# requests_kwargs = None,
|
||||
# raise_for_status = False,
|
||||
# bs_get_text_kwargs = None,
|
||||
# bs_kwargs = None,
|
||||
# session = None,
|
||||
# show_progress = True,
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
ESPN - Serving Sports Fans. Anytime. Anywhere.
|
||||
|
||||
{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
|
||||
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
ESPN - Serving Sports Fans. Anytime. Anywhere.
|
||||
|
||||
{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -24,29 +24,9 @@ _DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general"
|
||||
class UnstructuredLoader(BaseLoader):
|
||||
"""Unstructured document loader interface.
|
||||
|
||||
Partition and load files using either the `unstructured-client` sdk and the
|
||||
Unstructured API or locally using the `unstructured` library.
|
||||
|
||||
API:
|
||||
This package is configured to work with the Unstructured API by default.
|
||||
To use the Unstructured API, set
|
||||
`partition_via_api=True` and define `api_key`. If you are running the unstructured
|
||||
API locally, you can change the API rule by defining `url` when you initialize the
|
||||
loader. The hosted Unstructured API requires an API key. See the links below to
|
||||
learn more about our API offerings and get an API key.
|
||||
|
||||
Local:
|
||||
To partition files locally, you must have the `unstructured` package installed.
|
||||
You can install it with `pip install unstructured`.
|
||||
By default the file loader uses the Unstructured `partition` function and will
|
||||
automatically detect the file type.
|
||||
|
||||
In addition to document specific partition parameters, Unstructured has a rich set
|
||||
of "chunking" parameters for post-processing elements into more useful text segments
|
||||
for uses cases such as Retrieval Augmented Generation (RAG). You can pass additional
|
||||
Unstructured kwargs to the loader to configure different unstructured settings.
|
||||
|
||||
Setup:
|
||||
Install ``langchain-unstructured`` and set environment variable ``UNSTRUCTURED_API_KEY``.
|
||||
|
||||
.. code-block:: bash
|
||||
pip install -U langchain-unstructured
|
||||
export UNSTRUCTURED_API_KEY="your-api-key"
|
||||
@ -63,20 +43,46 @@ class UnstructuredLoader(BaseLoader):
|
||||
strategy="fast",
|
||||
)
|
||||
|
||||
Load:
|
||||
Lazy load:
|
||||
.. code-block:: python
|
||||
docs = loader.load()
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
1 2 0 2
|
||||
{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}
|
||||
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
1 2 0 2
|
||||
{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}
|
||||
|
||||
|
||||
References
|
||||
----------
|
||||
https://docs.unstructured.io/api-reference/api-services/sdk
|
||||
https://docs.unstructured.io/api-reference/api-services/overview
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
||||
"""
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
Loading…
Reference in New Issue
Block a user