[docs]: standardize doc loader doc strings (#25325)

This commit is contained in:
Isaac Francisco
2024-08-13 16:18:56 -07:00
committed by GitHub
parent e0bbb81d04
commit f4ffd692a3
8 changed files with 345 additions and 70 deletions

View File

@@ -39,7 +39,75 @@ def _build_metadata(soup: Any, url: str) -> dict:
class WebBaseLoader(BaseLoader):
"""Load HTML pages using `urllib` and parse them with `BeautifulSoup'."""
"""
WebBaseLoader document loader integration
Setup:
Install ``langchain_community``.
.. code-block:: bash
pip install -U langchain_community
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(
web_path = "https://www.espn.com/",
# header_template = None,
# verify_ssl = True,
# proxies = None,
# continue_on_failure = False,
# autoset_encoding = True,
# encoding = None,
# web_paths = (),
# requests_per_second = 2,
# default_parser = "html.parser",
# requests_kwargs = None,
# raise_for_status = False,
# bs_get_text_kwargs = None,
# bs_kwargs = None,
# session = None,
# show_progress = True,
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
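# async variant (alazy_load returns an async iterator, so iterate it
# with ``async for`` instead of awaiting it):
# async for doc in loader.alazy_load():
#     docs.append(doc)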
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
ESPN - Serving Sports Fans. Anytime. Anywhere.
{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
ESPN - Serving Sports Fans. Anytime. Anywhere.
{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
""" # noqa: E501
def __init__(
self,