# More comprehensive readthedocs document loader (#12382)
## Description

When building our own readthedocs.io scraper, we noticed a couple of interesting things:

1. **Text with many nested `<span>` tags comes back with spurious newlines.** In [Langchain's documentation](https://api.python.langchain.com/en/latest/document_loaders/langchain.document_loaders.readthedocs.ReadTheDocsLoader.html#langchain.document_loaders.readthedocs.ReadTheDocsLoader), for example, a single logical line is represented by a deeply nested HTML structure, and the naive `soup.get_text()` call currently being made produces a newline for each nested element, so the loader returns a messy, newline-separated blob of text. This is true in a lot of cases.

   ![Screenshot 2023-10-26 at 6 15 39 PM](https://github.com/langchain-ai/langchain/assets/44193474/eca85d1f-d2bf-4487-a18a-e1e732fadf19)
   ![Screenshot 2023-10-26 at 6 16 00 PM](https://github.com/langchain-ai/langchain/assets/44193474/035938a0-9892-4f6a-83cd-0d7b409b00a3)

   Additionally, content from iframes, code from `<script>` tags, CSS from `<style>` tags, and so on is captured whenever those elements are descendants of the selected container (which happens more often than you'd think). For example, [this page](https://pydeck.gl/gallery/contour_layer.html#) scrapes to 1.5 million characters of content that looks like this:

   ![Screenshot 2023-10-26 at 6 32 55 PM](https://github.com/langchain-ai/langchain/assets/44193474/dbd89e39-9478-4a18-9e84-f0eb91954eac)

   Therefore, I wrote a recursive `_get_clean_text(element)` helper that (a) skips all irrelevant elements and (b) only adds newlines where necessary.

2. **Index pages (like [this one](https://api.python.langchain.com/en/latest/api_reference.html)) get loaded, chunked, and eventually embedded.** This is bad not only because the user ends up embedding irrelevant information, but also because index pages are very likely to show up in retrieved content, making retrieval less effective (in our tests). Therefore, I added a float parameter `exclude_links_ratio`, defaulted to `1.0` (which preserves the current never-exclude behavior, although I'd petition for a stricter default), that skips any page where link text makes up more than the given fraction of the total text. Through manual testing, `0.5` (links are 50%+ of the page) seems to be the best threshold.

## Other Information

- **Issue:** n/a
- **Dependencies:** n/a
- **Tag maintainer:** n/a
- **Twitter handle:** @andrewthezhou

Co-authored-by: Andrew Zhou <andrew@heykona.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Commit `64c4a698a8` (parent `3468c038ba`)
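A minimal usage sketch of the new parameter (the `rtdocs/` path is a hypothetical scrape output; excluded pages come back as empty documents, per `_clean_data` in the diff below):

```python
from langchain.document_loaders import ReadTheDocsLoader

# Skip pages where link text is more than 50% of all text (index pages).
loader = ReadTheDocsLoader("rtdocs/", exclude_links_ratio=0.5)

# Excluded pages yield Documents with empty page_content; filter them out.
docs = [doc for doc in loader.load() if doc.page_content]
```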
**`langchain/document_loaders/readthedocs.py`**:

```diff
@@ -1,9 +1,15 @@
+from __future__ import annotations
+
 from pathlib import Path
-from typing import Any, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Sequence, Tuple, Union
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 
+if TYPE_CHECKING:
+    from bs4 import NavigableString
+    from bs4.element import Comment, Tag
+
 
 class ReadTheDocsLoader(BaseLoader):
     """Load `ReadTheDocs` documentation directory."""
```
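The import changes above hinge on a standard pattern: with postponed evaluation of annotations, the `bs4` names under `if TYPE_CHECKING:` are only imported by static type checkers, so `bs4` stays an optional runtime dependency. A minimal sketch of the pattern:

```python
from __future__ import annotations  # annotations are not evaluated at runtime

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; the module imports fine without bs4.
    from bs4.element import Tag


def first_tag_name(element: Tag) -> str:
    # "Tag" above is just a string at runtime thanks to the future import.
    return element.name
```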
```diff
@@ -15,7 +21,8 @@ class ReadTheDocsLoader(BaseLoader):
         errors: Optional[str] = None,
         custom_html_tag: Optional[Tuple[str, dict]] = None,
         patterns: Sequence[str] = ("*.htm", "*.html"),
-        **kwargs: Optional[Any]
+        exclude_links_ratio: float = 1.0,
+        **kwargs: Optional[Any],
     ):
         """
         Initialize ReadTheDocsLoader
```
```diff
@@ -36,6 +43,9 @@ class ReadTheDocsLoader(BaseLoader):
             custom_html_tag: Optional custom html tag to retrieve the content from
                 files.
             patterns: The file patterns to load, passed to `glob.rglob`.
+            exclude_links_ratio: The ratio of links:content to exclude pages from.
+                This is to reduce the frequency at which index pages make their
+                way into retrieved results. Recommended: 0.5
             kwargs: named arguments passed to `bs4.BeautifulSoup`.
         """
         try:
```
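To make the docstring's links:content ratio concrete, a small hand-computed example (the HTML snippet is illustrative, not from the test suite):

```python
from bs4 import BeautifulSoup

# 5 of the 15 non-whitespace characters live inside an <a> tag.
html = '<main>Read this: <a href="/x">guide</a></main>'
main = BeautifulSoup(html, "html.parser").find("main")

total = "".join(main.stripped_strings)  # "Read this:guide" -> 15 chars
links = "".join(s.strip() for a in main.find_all("a") for s in a.strings)  # "guide" -> 5
print(len(links) / len(total))  # ~0.33: below a 0.5 cutoff, so the page is kept
```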
```diff
@@ -48,7 +58,9 @@ class ReadTheDocsLoader(BaseLoader):
 
         try:
             _ = BeautifulSoup(
-                "<html><body>Parser builder library test.</body></html>", **kwargs
+                "<html><body>Parser builder library test.</body></html>",
+                "html.parser",
+                **kwargs,
             )
         except Exception as e:
             raise ValueError("Parsing kwargs do not appear valid") from e
```
```diff
@@ -59,24 +71,26 @@ class ReadTheDocsLoader(BaseLoader):
         self.custom_html_tag = custom_html_tag
         self.patterns = patterns
         self.bs_kwargs = kwargs
+        self.exclude_links_ratio = exclude_links_ratio
 
-    def load(self) -> List[Document]:
-        """Load documents."""
-        docs = []
+    def lazy_load(self) -> Iterator[Document]:
+        """A lazy loader for Documents."""
         for file_pattern in self.patterns:
             for p in self.file_path.rglob(file_pattern):
                 if p.is_dir():
                     continue
                 with open(p, encoding=self.encoding, errors=self.errors) as f:
                     text = self._clean_data(f.read())
-                metadata = {"source": str(p)}
-                docs.append(Document(page_content=text, metadata=metadata))
-        return docs
+                yield Document(page_content=text, metadata={"source": str(p)})
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        return list(self.lazy_load())
 
     def _clean_data(self, data: str) -> str:
         from bs4 import BeautifulSoup
 
-        soup = BeautifulSoup(data, **self.bs_kwargs)
+        soup = BeautifulSoup(data, "html.parser", **self.bs_kwargs)
 
         # default tags
         html_tags = [
```
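`lazy_load` brings the loader in line with the streaming side of the `BaseLoader` interface; a small consumption sketch (the `rtdocs/` path is hypothetical):

```python
from langchain.document_loaders import ReadTheDocsLoader

loader = ReadTheDocsLoader("rtdocs/")

# Files are parsed one at a time instead of all up front,
# keeping memory flat on large documentation trees.
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))
```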
```diff
@@ -87,18 +101,121 @@ class ReadTheDocsLoader(BaseLoader):
         if self.custom_html_tag is not None:
             html_tags.append(self.custom_html_tag)
 
-        text = None
+        element = None
 
         # reversed order. check the custom one first
         for tag, attrs in html_tags[::-1]:
-            text = soup.find(tag, attrs)
+            element = soup.find(tag, attrs)
             # if found, break
-            if text is not None:
+            if element is not None:
                 break
 
-        if text is not None:
-            text = text.get_text()
+        if element is not None and _get_link_ratio(element) <= self.exclude_links_ratio:
+            text = _get_clean_text(element)
         else:
             text = ""
         # trim empty lines
         return "\n".join([t for t in text.split("\n") if t])
+
+
+def _get_clean_text(element: Tag) -> str:
+    """Returns cleaned text with newlines preserved and irrelevant elements removed."""
+    elements_to_skip = [
+        "script",
+        "noscript",
+        "canvas",
+        "meta",
+        "svg",
+        "map",
+        "area",
+        "audio",
+        "source",
+        "track",
+        "video",
+        "embed",
+        "object",
+        "param",
+        "picture",
+        "iframe",
+        "frame",
+        "frameset",
+        "noframes",
+        "applet",
+        "form",
+        "button",
+        "select",
+        "base",
+        "style",
+        "img",
+    ]
+
+    newline_elements = [
+        "p",
+        "div",
+        "ul",
+        "ol",
+        "li",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "pre",
+        "table",
+        "tr",
+    ]
+
+    text = _process_element(element, elements_to_skip, newline_elements)
+    return text.strip()
+
+
+def _get_link_ratio(section: Tag) -> float:
+    links = section.find_all("a")
+    total_text = "".join(str(s) for s in section.stripped_strings)
+    if len(total_text) == 0:
+        return 0
+
+    link_text = "".join(
+        str(string.string.strip())
+        for link in links
+        for string in link.strings
+        if string
+    )
+    return len(link_text) / len(total_text)
+
+
+def _process_element(
+    element: Union[Tag, NavigableString, Comment],
+    elements_to_skip: List[str],
+    newline_elements: List[str],
+) -> str:
+    """
+    Traverse through HTML tree recursively to preserve newline and skip
+    unwanted (code/binary) elements
+    """
+    from bs4 import NavigableString
+    from bs4.element import Comment, Tag
+
+    tag_name = getattr(element, "name", None)
+    if isinstance(element, Comment) or tag_name in elements_to_skip:
+        return ""
+    elif isinstance(element, NavigableString):
+        return element
+    elif tag_name == "br":
+        return "\n"
+    elif tag_name in newline_elements:
+        return (
+            "".join(
+                _process_element(child, elements_to_skip, newline_elements)
+                for child in element.children
+                if isinstance(child, (Tag, NavigableString, Comment))
+            )
+            + "\n"
+        )
+    else:
+        return "".join(
+            _process_element(child, elements_to_skip, newline_elements)
+            for child in element.children
+            if isinstance(child, (Tag, NavigableString, Comment))
+        )
```
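A before/after sketch of what the new skip list buys, calling the private helper directly (the HTML snippet is illustrative):

```python
from bs4 import BeautifulSoup

from langchain.document_loaders.readthedocs import _get_clean_text

html = (
    '<div id="main-content">'
    "<style>.hidden{display:none}</style>"
    "<script>trackPageView();</script>"
    "<p>Actual docs text.</p>"
    "</div>"
)
main = BeautifulSoup(html, "html.parser").find("div")

# Old path: get_text() returns the CSS and JS along with the prose.
print(main.get_text())        # ".hidden{display:none}trackPageView();Actual docs text."

# New path: <style> and <script> (and ~25 other tags) are skipped.
print(_get_clean_text(main))  # "Actual docs text."
```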
New test fixture `index_page`, an index-style page that is almost entirely links:

```diff
@@ -0,0 +1,10 @@
+<html>
+  <main id="main-content">
+    Websites:
+    <a href="https://langchain.com">Langchain</a>
+    <a href="https://docs.langchain.com">Langchain Docs</a>
+    <a href="https://api.python.langchain.com/en/latest/api_reference.html"
+      >Langchain API Reference</a
+    >
+  </main>
+</html>
```
New test fixture `nested_html_structure`, a single logical line wrapped in nested inline tags:

```diff
@@ -0,0 +1,5 @@
+<html>
+  <main id="main-content">
+    Hello <span><em>World</em>!</span>
+  </main>
+</html>
```
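For the `index_page` fixture, the ratio works out by hand as follows: the link text is "Langchain" (9 chars) + "Langchain Docs" (14) + "Langchain API Reference" (23) = 46 characters, while the only non-link text is "Websites:" (9), for 55 characters total. The ratio 46/55 ≈ 0.84 is well above the 0.5 cutoff used in `test_index_page` below, so the page is excluded and yields empty content.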
Unit tests for the loader:

```diff
@@ -31,6 +31,20 @@ def test_custom() -> None:
     assert len(documents[0].page_content) != 0
 
 
+@pytest.mark.requires("bs4")
+def test_nested_html_structure() -> None:
+    loader = ReadTheDocsLoader(PARENT_DIR / "nested_html_structure")
+    documents = loader.load()
+    assert documents[0].page_content == "Hello World!"
+
+
+@pytest.mark.requires("bs4")
+def test_index_page() -> None:
+    loader = ReadTheDocsLoader(PARENT_DIR / "index_page", exclude_links_ratio=0.5)
+    documents = loader.load()
+    assert len(documents[0].page_content) == 0
+
+
 @pytest.mark.requires("bs4")
 def test_empty() -> None:
     loader = ReadTheDocsLoader(
```