mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 16:13:25 +00:00
docs: fix broken Appearance of langchain_community/document_loaders/recursive_url_loader API Reference (#29305)
# PR message ## Description Fixed the broken appearance of the RecursiveUrlLoader API Reference. ### Before <p align="center"> <img width="750" alt="image" src="https://github.com/user-attachments/assets/f39df65d-b788-411d-88af-8bfa2607c00b" /> <img width="750" alt="image" src="https://github.com/user-attachments/assets/b8a92b70-4548-4b4a-965f-026faeebd0ec" /> </p> ### After <p align="center"> <img width="750" alt="image" src="https://github.com/user-attachments/assets/8ea28146-de45-42e2-b346-3004ec4dfc55" /> <img width="750" alt="image" src="https://github.com/user-attachments/assets/914c6966-4055-45d3-baeb-2d97eab06fe7" /> </p> ## Issue: N/A ## Dependencies None ## Twitter handle N/A # Add tests and docs Not applicable; this change only affects documentation. # Lint and test Ran make format, make lint, and make test to ensure no issues.
This commit is contained in:
parent
6c52378992
commit
5d64597490
@ -53,7 +53,8 @@ def _metadata_extractor(
|
|||||||
class RecursiveUrlLoader(BaseLoader):
|
class RecursiveUrlLoader(BaseLoader):
|
||||||
"""Recursively load all child links from a root URL.
|
"""Recursively load all child links from a root URL.
|
||||||
|
|
||||||
**Security Note**: This loader is a crawler that will start crawling
|
**Security Note**:
|
||||||
|
This loader is a crawler that will start crawling
|
||||||
at a given URL and then expand to crawl child links recursively.
|
at a given URL and then expand to crawl child links recursively.
|
||||||
|
|
||||||
Web crawlers should generally NOT be deployed with network access
|
Web crawlers should generally NOT be deployed with network access
|
||||||
@ -154,36 +155,36 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
content. To parse this HTML into a more human/LLM-friendly format you can pass
|
content. To parse this HTML into a more human/LLM-friendly format you can pass
|
||||||
in a custom ``extractor`` method:
|
in a custom ``extractor`` method:
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
# This example uses `beautifulsoup4` and `lxml`
|
# This example uses `beautifulsoup4` and `lxml`
|
||||||
import re
|
import re
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
def bs4_extractor(html: str) -> str:
|
def bs4_extractor(html: str) -> str:
|
||||||
soup = BeautifulSoup(html, "lxml")
|
soup = BeautifulSoup(html, "lxml")
|
||||||
return re.sub(r"\n\n+", "\n\n", soup.text).strip()
|
return re.sub(r"\\n\\n+", "\\n\\n", soup.text).strip()
|
||||||
|
|
||||||
loader = RecursiveUrlLoader(
|
loader = RecursiveUrlLoader(
|
||||||
"https://docs.python.org/3.9/",
|
"https://docs.python.org/3.9/",
|
||||||
extractor=bs4_extractor,
|
extractor=bs4_extractor,
|
||||||
)
|
)
|
||||||
print(loader.load()[0].page_content[:200])
|
print(loader.load()[0].page_content[:200])
|
||||||
|
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
3.9.19 Documentation
|
3.9.19 Documentation
|
||||||
|
|
||||||
Download
|
Download
|
||||||
Download these documents
|
Download these documents
|
||||||
Docs by version
|
Docs by version
|
||||||
|
|
||||||
Python 3.13 (in development)
|
Python 3.13 (in development)
|
||||||
Python 3.12 (stable)
|
Python 3.12 (stable)
|
||||||
Python 3.11 (security-fixes)
|
Python 3.11 (security-fixes)
|
||||||
Python 3.10 (security-fixes)
|
Python 3.10 (security-fixes)
|
||||||
Python 3.9 (securit
|
Python 3.9 (securit
|
||||||
|
|
||||||
Metadata extraction:
|
Metadata extraction:
|
||||||
Similarly to content extraction, you can specify a metadata extraction function
|
Similarly to content extraction, you can specify a metadata extraction function
|
||||||
|
Loading…
Reference in New Issue
Block a user