RecusiveUrlLoader to RecursiveUrlLoader (#6787)

Authored by WaseemH on 2023-06-27 02:12:14 -04:00; committed by GitHub
parent 4535b0b41e
commit 7ac9b22886
3 changed files with 11 additions and 8 deletions


@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "5a7cc773",
    "metadata": {},
@@ -17,7 +18,7 @@
     "\n",
     "But, the challenge is traversing the tree of child pages and actually assembling that list!\n",
     " \n",
-    "We do this using the `RecusiveUrlLoader`.\n",
+    "We do this using the `RecursiveUrlLoader`.\n",
     "\n",
     "This also gives us the flexibility to exclude some children (e.g., the `api` directory with > 800 child pages)."
    ]
@@ -29,10 +30,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader"
+    "from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "6384c057",
    "metadata": {},
@@ -48,7 +50,7 @@
    "outputs": [],
    "source": [
     "url = 'https://js.langchain.com/docs/modules/memory/examples/'\n",
-    "loader=RecusiveUrlLoader(url=url)\n",
+    "loader=RecursiveUrlLoader(url=url)\n",
     "docs=loader.load()"
    ]
   },
@@ -119,6 +121,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "40fc13ef",
    "metadata": {},
@@ -137,7 +140,7 @@
    "source": [
     "url = 'https://js.langchain.com/docs/'\n",
     "exclude_dirs=['https://js.langchain.com/docs/api/']\n",
-    "loader=RecusiveUrlLoader(url=url,exclude_dirs=exclude_dirs)\n",
+    "loader=RecursiveUrlLoader(url=url,exclude_dirs=exclude_dirs)\n",
     "docs=loader.load()"
    ]
   },


@@ -95,7 +95,7 @@ from langchain.document_loaders.psychic import PsychicLoader
 from langchain.document_loaders.pyspark_dataframe import PySparkDataFrameLoader
 from langchain.document_loaders.python import PythonLoader
 from langchain.document_loaders.readthedocs import ReadTheDocsLoader
-from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader
+from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
 from langchain.document_loaders.reddit import RedditPostsLoader
 from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.rst import UnstructuredRSTLoader
@@ -230,7 +230,7 @@ __all__ = [
     "PySparkDataFrameLoader",
     "PythonLoader",
     "ReadTheDocsLoader",
-    "RecusiveUrlLoader",
+    "RecursiveUrlLoader",
     "RedditPostsLoader",
     "RoamLoader",
     "S3DirectoryLoader",


@@ -7,7 +7,7 @@ from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 
 
-class RecusiveUrlLoader(BaseLoader):
+class RecursiveUrlLoader(BaseLoader):
     """Loader that loads all child links from a given url."""
 
     def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None:
@@ -24,7 +24,7 @@ class RecusiveUrlLoader(BaseLoader):
             from bs4 import BeautifulSoup
         except ImportError:
             raise ImportError(
-                "The BeautifulSoup package is required for the RecusiveUrlLoader."
+                "The BeautifulSoup package is required for the RecursiveUrlLoader."
             )
 
         # Construct the base and parent URLs
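
The loader imports BeautifulSoup lazily and raises the (now correctly worded) ImportError when it is missing. Below is a minimal sketch of handling that on the caller side, assuming the error surfaces when load() is called; the pip package name mentioned in the comment (beautifulsoup4) is the usual distribution name for bs4, not something stated in this diff.

from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

loader = RecursiveUrlLoader(url="https://js.langchain.com/docs/modules/memory/examples/")
try:
    docs = loader.load()
except ImportError as err:
    # Raised when bs4 is not installed; `pip install beautifulsoup4` provides it.
    print(err)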