mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-17 10:13:29 +00:00
RecusiveUrlLoader
to RecursiveUrlLoader
(#6787)
This commit is contained in:
parent
4535b0b41e
commit
7ac9b22886
@ -1,6 +1,7 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "5a7cc773",
|
"id": "5a7cc773",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -17,7 +18,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"But, the challenge is traversing the tree of child pages and actually assembling that list!\n",
|
"But, the challenge is traversing the tree of child pages and actually assembling that list!\n",
|
||||||
" \n",
|
" \n",
|
||||||
"We do this using the `RecusiveUrlLoader`.\n",
|
"We do this using the `RecursiveUrlLoader`.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This also gives us the flexibility to exclude some children (e.g., the `api` directory with > 800 child pages)."
|
"This also gives us the flexibility to exclude some children (e.g., the `api` directory with > 800 child pages)."
|
||||||
]
|
]
|
||||||
@ -29,10 +30,11 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader"
|
"from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "6384c057",
|
"id": "6384c057",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -48,7 +50,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"url = 'https://js.langchain.com/docs/modules/memory/examples/'\n",
|
"url = 'https://js.langchain.com/docs/modules/memory/examples/'\n",
|
||||||
"loader=RecusiveUrlLoader(url=url)\n",
|
"loader=RecursiveUrlLoader(url=url)\n",
|
||||||
"docs=loader.load()"
|
"docs=loader.load()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -119,6 +121,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "40fc13ef",
|
"id": "40fc13ef",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -137,7 +140,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"url = 'https://js.langchain.com/docs/'\n",
|
"url = 'https://js.langchain.com/docs/'\n",
|
||||||
"exclude_dirs=['https://js.langchain.com/docs/api/']\n",
|
"exclude_dirs=['https://js.langchain.com/docs/api/']\n",
|
||||||
"loader=RecusiveUrlLoader(url=url,exclude_dirs=exclude_dirs)\n",
|
"loader=RecursiveUrlLoader(url=url,exclude_dirs=exclude_dirs)\n",
|
||||||
"docs=loader.load()"
|
"docs=loader.load()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -95,7 +95,7 @@ from langchain.document_loaders.psychic import PsychicLoader
|
|||||||
from langchain.document_loaders.pyspark_dataframe import PySparkDataFrameLoader
|
from langchain.document_loaders.pyspark_dataframe import PySparkDataFrameLoader
|
||||||
from langchain.document_loaders.python import PythonLoader
|
from langchain.document_loaders.python import PythonLoader
|
||||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||||
from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader
|
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
|
||||||
from langchain.document_loaders.reddit import RedditPostsLoader
|
from langchain.document_loaders.reddit import RedditPostsLoader
|
||||||
from langchain.document_loaders.roam import RoamLoader
|
from langchain.document_loaders.roam import RoamLoader
|
||||||
from langchain.document_loaders.rst import UnstructuredRSTLoader
|
from langchain.document_loaders.rst import UnstructuredRSTLoader
|
||||||
@ -230,7 +230,7 @@ __all__ = [
|
|||||||
"PySparkDataFrameLoader",
|
"PySparkDataFrameLoader",
|
||||||
"PythonLoader",
|
"PythonLoader",
|
||||||
"ReadTheDocsLoader",
|
"ReadTheDocsLoader",
|
||||||
"RecusiveUrlLoader",
|
"RecursiveUrlLoader",
|
||||||
"RedditPostsLoader",
|
"RedditPostsLoader",
|
||||||
"RoamLoader",
|
"RoamLoader",
|
||||||
"S3DirectoryLoader",
|
"S3DirectoryLoader",
|
||||||
|
@ -7,7 +7,7 @@ from langchain.docstore.document import Document
|
|||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
class RecusiveUrlLoader(BaseLoader):
|
class RecursiveUrlLoader(BaseLoader):
|
||||||
"""Loader that loads all child links from a given url."""
|
"""Loader that loads all child links from a given url."""
|
||||||
|
|
||||||
def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None:
|
def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None:
|
||||||
@ -24,7 +24,7 @@ class RecusiveUrlLoader(BaseLoader):
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"The BeautifulSoup package is required for the RecusiveUrlLoader."
|
"The BeautifulSoup package is required for the RecursiveUrlLoader."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Construct the base and parent URLs
|
# Construct the base and parent URLs
|
||||||
|
Loading…
Reference in New Issue
Block a user