mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-06 05:08:20 +00:00
Add optional base_url
arg to GitbookLoader
(#1552)
First of all, big kudos on what you guys are doing, langchain is enabling some really amazing use cases and I'm having lots of fun playing around with it. It's really cool how many data sources it supports out of the box. However, I noticed some limitations of the current `GitbookLoader` which this PR addresses: The main change is that I added an optional `base_url` arg to `GitbookLoader`. This enables use cases where one wants to crawl docs from a start page other than the index page, e.g., the following call would scrape all pages that are reachable via nav bar links from "https://docs.zenml.io/v/0.35.0": ```python GitbookLoader( web_page="https://docs.zenml.io/v/0.35.0", load_all_paths=True, base_url="https://docs.zenml.io", ) ``` Previously, this would fail because relative links would be of the form `/v/0.35.0/...` and the full link URLs would become `docs.zenml.io/v/0.35.0/v/0.35.0/...`. I also fixed another issue of the `GitbookLoader` where the link URLs were constructed incorrectly as `website//relative_url` if the provided `web_page` had a trailing slash.
This commit is contained in:
parent
c9189d354a
commit
b44c8bd969
@ -12,9 +12,26 @@ class GitbookLoader(WebBaseLoader):
|
|||||||
2. load all (relative) paths in the navbar.
|
2. load all (relative) paths in the navbar.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
    self,
    web_page: str,
    load_all_paths: bool = False,
    base_url: Optional[str] = None,
):
    """Initialize with web page and whether to load all paths.

    Args:
        web_page: The web page to load or the starting point from where
            relative paths are discovered.
        load_all_paths: If set to True, all relative paths in the navbar
            are loaded instead of only `web_page`.
        base_url: If `load_all_paths` is True, the relative paths are
            appended to this base url. Defaults to `web_page` if not set.
    """
    super().__init__(web_page)
    # Fall back to the start page as the root for relative links, and
    # drop a single trailing slash so joined URLs never contain "//".
    root = base_url or web_page
    self.base_url = root[:-1] if root.endswith("/") else root
    self.load_all_paths = load_all_paths
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
@ -24,7 +41,7 @@ class GitbookLoader(WebBaseLoader):
|
|||||||
relative_paths = self._get_paths(soup_info)
|
relative_paths = self._get_paths(soup_info)
|
||||||
documents = []
|
documents = []
|
||||||
for path in relative_paths:
|
for path in relative_paths:
|
||||||
url = self.web_path + path
|
url = self.base_url + path
|
||||||
print(f"Fetching text from {url}")
|
print(f"Fetching text from {url}")
|
||||||
soup_info = self._scrape(url)
|
soup_info = self._scrape(url)
|
||||||
documents.append(self._get_document(soup_info, url))
|
documents.append(self._get_document(soup_info, url))
|
||||||
|
Loading…
Reference in New Issue
Block a user