From 5d64597490cc58d2f3a84b4f6bbb71b49abcd168 Mon Sep 17 00:00:00 2001 From: Sangyun_LEE Date: Tue, 21 Jan 2025 00:56:59 +0900 Subject: [PATCH] docs: fix broken Appearance of langchain_community/document_loaders/recursive_url_loader API Reference (#29305) # PR message ## Description Fixed a broken appearance of the RecursiveUrlLoader API Reference. ### Before

image image

### After

image image

## Issue: N/A ## Dependencies None ## Twitter handle N/A # Add tests and docs Not applicable; this change only affects documentation. # Lint and test Ran make format, make lint, and make test to ensure no issues. --- .../document_loaders/recursive_url_loader.py | 47 ++++++++++--------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py index ebf119bafc2..9c63a71ac31 100644 --- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py +++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py @@ -53,7 +53,8 @@ def _metadata_extractor( class RecursiveUrlLoader(BaseLoader): """Recursively load all child links from a root URL. - **Security Note**: This loader is a crawler that will start crawling + **Security Note**: + This loader is a crawler that will start crawling at a given URL and then expand to crawl child links recursively. Web crawlers should generally NOT be deployed with network access @@ -154,36 +155,36 @@ class RecursiveUrlLoader(BaseLoader): content. To parse this HTML into a more human/LLM-friendly format you can pass in a custom ``extractor`` method: - .. code-block:: python + .. 
code-block:: python - # This example uses `beautifulsoup4` and `lxml` - import re - from bs4 import BeautifulSoup + # This example uses `beautifulsoup4` and `lxml` + import re + from bs4 import BeautifulSoup - def bs4_extractor(html: str) -> str: - soup = BeautifulSoup(html, "lxml") - return re.sub(r"\n\n+", "\n\n", soup.text).strip() + def bs4_extractor(html: str) -> str: + soup = BeautifulSoup(html, "lxml") + return re.sub(r"\\n\\n+", "\\n\\n", soup.text).strip() - loader = RecursiveUrlLoader( - "https://docs.python.org/3.9/", - extractor=bs4_extractor, - ) - print(loader.load()[0].page_content[:200]) + loader = RecursiveUrlLoader( + "https://docs.python.org/3.9/", + extractor=bs4_extractor, + ) + print(loader.load()[0].page_content[:200]) - .. code-block:: python + .. code-block:: python - 3.9.19 Documentation + 3.9.19 Documentation - Download - Download these documents - Docs by version + Download + Download these documents + Docs by version - Python 3.13 (in development) - Python 3.12 (stable) - Python 3.11 (security-fixes) - Python 3.10 (security-fixes) - Python 3.9 (securit + Python 3.13 (in development) + Python 3.12 (stable) + Python 3.11 (security-fixes) + Python 3.10 (security-fixes) + Python 3.9 (securit Metadata extraction: Similarly to content extraction, you can specify a metadata extraction function