mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 05:13:46 +00:00
Support gitbooks in a subdirectory
This commit is contained in:
parent
a5b05dcc66
commit
e77b63bdad
@ -1,6 +1,6 @@
|
|||||||
"""Loader that loads GitBook."""
|
"""Loader that loads GitBook."""
|
||||||
from typing import Any, List, Optional
|
from typing import Any, List, Optional
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.web_base import WebBaseLoader
|
from langchain.document_loaders.web_base import WebBaseLoader
|
||||||
@ -47,7 +47,7 @@ class GitbookLoader(WebBaseLoader):
|
|||||||
relative_paths = self._get_paths(soup_info)
|
relative_paths = self._get_paths(soup_info)
|
||||||
documents = []
|
documents = []
|
||||||
for path in relative_paths:
|
for path in relative_paths:
|
||||||
url = self.base_url + path
|
url = urljoin(self.base_url, path)
|
||||||
print(f"Fetching text from {url}")
|
print(f"Fetching text from {url}")
|
||||||
soup_info = self._scrape(url)
|
soup_info = self._scrape(url)
|
||||||
documents.append(self._get_document(soup_info, url))
|
documents.append(self._get_document(soup_info, url))
|
||||||
|
Loading…
Reference in New Issue
Block a user