Recursive URL loader (#6455)

We may want to load all URLs under a root directory.

For example, let's look at the [LangChain JS
documentation](https://js.langchain.com/docs/).

This has many interesting child pages that we may want to read in bulk.

Of course, the `WebBaseLoader` can load a list of pages. 

But, the challenge is traversing the tree of child pages and actually
assembling that list!
 
We do this using the `RecusiveUrlLoader`.

This also gives us the flexibility to exclude some children (e.g., the
`api` directory with > 800 child pages).
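
A minimal usage sketch of the flow shown in the notebook added by this PR (the URLs and document count come from that notebook run):

```python
# Minimal usage sketch, mirroring the notebook added in this PR.
from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader

url = "https://js.langchain.com/docs/"
exclude_dirs = ["https://js.langchain.com/docs/api/"]  # skip the ~800-page api directory

loader = RecusiveUrlLoader(url=url, exclude_dirs=exclude_dirs)
docs = loader.load()
print(len(docs))  # 176 documents when the notebook was run
```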
Lance Martin, 2023-06-23 13:09:00 -07:00 (committed by GitHub)
commit c2b25c17c5, parent be02572d58
4 changed files with 327 additions and 6 deletions

New file: example notebook (Recursive URL Loader)

@@ -0,0 +1,232 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "5a7cc773",
"metadata": {},
"source": [
"# Recursive URL Loader\n",
"\n",
"We may want to process load all URLs under a root directory.\n",
"\n",
"For example, let's look at the [LangChain JS documentation](https://js.langchain.com/docs/).\n",
"\n",
"This has many interesting child pages that we may want to read in bulk.\n",
"\n",
"Of course, the `WebBaseLoader` can load a list of pages. \n",
"\n",
"But, the challenge is traversing the tree of child pages and actually assembling that list!\n",
" \n",
"We do this using the `RecusiveUrlLoader`.\n",
"\n",
"This also gives us the flexibility to exclude some children (e.g., the `api` directory with > 800 child pages)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2e3532b2",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader"
]
},
{
"cell_type": "markdown",
"id": "6384c057",
"metadata": {},
"source": [
"Let's try a simple example."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d69e5620",
"metadata": {},
"outputs": [],
"source": [
"url = 'https://js.langchain.com/docs/modules/memory/examples/'\n",
"loader=RecusiveUrlLoader(url=url)\n",
"docs=loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "084fb2ce",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "89355b7c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n\\n\\n\\n\\nDynamoDB-Backed Chat Memory | \\uf8ffü¶úÔ∏è\\uf8ffüîó Lan'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].page_content[:50]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "13bd7e16",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'source': 'https://js.langchain.com/docs/modules/memory/examples/dynamodb',\n",
" 'title': 'DynamoDB-Backed Chat Memory | \\uf8ffü¶úÔ∏è\\uf8ffüîó Langchain',\n",
" 'description': 'For longer-term persistence across chat sessions, you can swap out the default in-memory chatHistory that backs chat memory classes like BufferMemory for a DynamoDB instance.',\n",
" 'language': 'en'}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].metadata"
]
},
{
"cell_type": "markdown",
"id": "40fc13ef",
"metadata": {},
"source": [
"Now, let's try a more extensive example, the `docs` root dir.\n",
"\n",
"We will skip everything under `api`."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "30ff61d3",
"metadata": {},
"outputs": [],
"source": [
"url = 'https://js.langchain.com/docs/'\n",
"exclude_dirs=['https://js.langchain.com/docs/api/']\n",
"loader=RecusiveUrlLoader(url=url,exclude_dirs=exclude_dirs)\n",
"docs=loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "457e30f3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"176"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "bca80b4a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n\\n\\n\\n\\nHacker News | \\uf8ffü¶úÔ∏è\\uf8ffüîó Langchain\\n\\n\\n\\n\\n\\nSkip'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].page_content[:50]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "df97cf22",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'source': 'https://js.langchain.com/docs/modules/indexes/document_loaders/examples/web_loaders/hn',\n",
" 'title': 'Hacker News | \\uf8ffü¶úÔ∏è\\uf8ffüîó Langchain',\n",
" 'description': 'This example goes over how to load data from the hacker news website, using Cheerio. One document will be created for each page.',\n",
" 'language': 'en'}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0].metadata"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File: langchain/document_loaders/__init__.py

@@ -94,6 +94,7 @@ from langchain.document_loaders.psychic import PsychicLoader
 from langchain.document_loaders.pyspark_dataframe import PySparkDataFrameLoader
 from langchain.document_loaders.python import PythonLoader
 from langchain.document_loaders.readthedocs import ReadTheDocsLoader
+from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader
 from langchain.document_loaders.reddit import RedditPostsLoader
 from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.rtf import UnstructuredRTFLoader
@@ -226,6 +227,7 @@ __all__ = [
     "PySparkDataFrameLoader",
     "PythonLoader",
     "ReadTheDocsLoader",
+    "RecusiveUrlLoader",
     "RedditPostsLoader",
     "RoamLoader",
     "S3DirectoryLoader",

New file: langchain/document_loaders/recursive_url_loader.py

@@ -0,0 +1,86 @@
from typing import Iterator, List, Optional, Set
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class RecusiveUrlLoader(BaseLoader):
    """Loader that loads all child links from a given url."""

    def __init__(self, url: str, exclude_dirs: Optional[List[str]] = None) -> None:
        """Initialize with URL to crawl and any sub-directories to exclude."""
        self.url = url
        self.exclude_dirs = exclude_dirs

    def get_child_links_recursive(
        self, url: str, visited: Optional[Set[str]] = None
    ) -> Set[str]:
        """Recursively get all child links starting with the path of the input URL."""
        # Construct the base and parent URLs
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        parent_url = "/".join(parsed_url.path.split("/")[:-1])
        current_path = parsed_url.path

        # Add a trailing slash if not present
        if not base_url.endswith("/"):
            base_url += "/"
        if not parent_url.endswith("/"):
            parent_url += "/"

        # Exclude the root and parent from the list
        visited = set() if visited is None else visited

        # Skip any URL that starts with one of the excluded directories
        if self.exclude_dirs and any(
            url.startswith(exclude_dir) for exclude_dir in self.exclude_dirs
        ):
            return visited

        # Get all links that are relative to the root of the website
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        all_links = [link.get("href") for link in soup.find_all("a")]

        # Extract only the links that are children of the current URL
        child_links = list(
            {
                link
                for link in all_links
                if link and link.startswith(current_path) and link != current_path
            }
        )

        # Get the absolute path for all root-relative links listed
        absolute_paths = [
            f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}{link}"
            for link in child_links
        ]

        # Store the visited links and recursively visit the children
        for link in absolute_paths:
            # Check all unvisited links
            if link not in visited:
                visited.add(link)
                # If the link is a directory (w/ children) then visit it
                if link.endswith("/"):
                    visited.update(self.get_child_links_recursive(link, visited))

        return visited

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load web pages."""
        # Local import to avoid a circular import with the document_loaders package
        from langchain.document_loaders import WebBaseLoader

        child_links = self.get_child_links_recursive(self.url)
        loader = WebBaseLoader(list(child_links))
        return loader.lazy_load()

    def load(self) -> List[Document]:
        """Load web pages."""
        return list(self.lazy_load())
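
A small illustrative walk-through of the child-link filter used in `get_child_links_recursive`; the page URL and the scraped hrefs below are made up for the example:

```python
# Sketch of the child-link filter; the URL and hrefs are illustrative only.
from urllib.parse import urlparse

url = "https://js.langchain.com/docs/modules/memory/examples/"
parsed = urlparse(url)
base_url = f"{parsed.scheme}://{parsed.netloc}"  # "https://js.langchain.com"
current_path = parsed.path                       # "/docs/modules/memory/examples/"

# Pretend these hrefs were scraped from the page with BeautifulSoup.
all_links = [
    "/docs/modules/memory/examples/dynamodb",  # child of current_path -> kept
    "/docs/api/",                              # not under current_path -> dropped
    current_path,                              # the page itself -> dropped
    None,                                      # an <a> tag without an href -> dropped
]

child_links = {
    link
    for link in all_links
    if link and link.startswith(current_path) and link != current_path
}
absolute_paths = [f"{base_url}{link}" for link in child_links]
print(absolute_paths)
# ['https://js.langchain.com/docs/modules/memory/examples/dynamodb']
```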

File: langchain/document_loaders/web_base.py

@@ -2,7 +2,7 @@
 import asyncio
 import logging
 import warnings
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, Iterator, List, Optional, Union

 import aiohttp
 import requests
@@ -197,16 +197,17 @@ class WebBaseLoader(BaseLoader):
         return self._scrape(self.web_path, parser)

-    def load(self) -> List[Document]:
-        """Load text from the url(s) in web_path."""
-        docs = []
+    def lazy_load(self) -> Iterator[Document]:
+        """Lazy load text from the url(s) in web_path."""
         for path in self.web_paths:
             soup = self._scrape(path)
             text = soup.get_text()
             metadata = _build_metadata(soup, path)
-            docs.append(Document(page_content=text, metadata=metadata))
-        return docs
+            yield Document(page_content=text, metadata=metadata)
+
+    def load(self) -> List[Document]:
+        """Load text from the url(s) in web_path."""
+        return list(self.lazy_load())

     def aload(self) -> List[Document]:
         """Load text from the urls in web_path async into Documents."""