Add typing, rename exclude_index_pages to exclude_links_ratio, etc.

Andrew Zhou
2023-10-27 16:42:20 -04:00
parent f19dc245a7
commit 2c334d4d4f
2 changed files with 61 additions and 40 deletions

@@ -1,9 +1,13 @@
 from pathlib import Path
-from typing import Any, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader

+if TYPE_CHECKING:
+    from bs4 import NavigableString
+    from bs4.element import Comment, Tag

 class ReadTheDocsLoader(BaseLoader):
     """Load `ReadTheDocs` documentation directory."""
@@ -15,7 +19,7 @@ class ReadTheDocsLoader(BaseLoader):
         errors: Optional[str] = None,
         custom_html_tag: Optional[Tuple[str, dict]] = None,
         patterns: Sequence[str] = ("*.htm", "*.html"),
-        exclude_index_pages: bool = False,
+        exclude_links_ratio: Optional[float] = 1.0,
         **kwargs: Optional[Any]
     ):
         """
@@ -37,9 +41,9 @@ class ReadTheDocsLoader(BaseLoader):
             custom_html_tag: Optional custom html tag to retrieve the content from
                 files.
             patterns: The file patterns to load, passed to `glob.rglob`.
-            exclude_index_pages: Exclude index pages with high link ratios (>50%).
+            exclude_links_ratio: The ratio of links:content to exclude pages from.
                 This is to reduce the frequency at which index pages make their
-                way into retrieved results.
+                way into retrieved results. Recommended: 0.5
             kwargs: named arguments passed to `bs4.BeautifulSoup`.
         """
         try:
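
For reference, a minimal usage sketch of the renamed parameter (the "rtdocs/" path is illustrative and assumes a locally scraped ReadTheDocs directory):

from langchain.document_loaders import ReadTheDocsLoader

# Pages whose links:content ratio exceeds 0.5 are skipped; the new default of
# 1.0 keeps every page.
loader = ReadTheDocsLoader("rtdocs/", exclude_links_ratio=0.5)
docs = loader.load()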
@@ -52,7 +56,9 @@ class ReadTheDocsLoader(BaseLoader):
         try:
             _ = BeautifulSoup(
-                "<html><body>Parser builder library test.</body></html>", **kwargs
+                "<html><body>Parser builder library test.</body></html>",
+                "html.parser",
+                **kwargs
             )
         except Exception as e:
             raise ValueError("Parsing kwargs do not appear valid") from e
@@ -63,7 +69,7 @@ class ReadTheDocsLoader(BaseLoader):
         self.custom_html_tag = custom_html_tag
         self.patterns = patterns
         self.bs_kwargs = kwargs
-        self.exclude_index_pages = exclude_index_pages
+        self.exclude_links_ratio = exclude_links_ratio

     def load(self) -> List[Document]:
         """Load documents."""
@@ -78,7 +84,7 @@ class ReadTheDocsLoader(BaseLoader):
             docs.append(Document(page_content=text, metadata=metadata))
         return docs

-    def _get_link_ratio(self, section):
+    def _get_link_ratio(self, section: "Tag"):
         links = section.find_all("a")
         total_text = "".join(str(s) for s in section.stripped_strings)
         if len(total_text) == 0:
@@ -92,14 +98,46 @@ class ReadTheDocsLoader(BaseLoader):
         )
         return len(link_text) / len(total_text)

-    def _get_clean_text(self, element):
+    def _process_element(
+        self,
+        element: Union["Tag", "NavigableString", "Comment"],
+        elements_to_skip: List[str],
+        newline_elements: List[str],
+    ):
         """
-        Recursive text getter that excludes code and binary content and
-        preserves newline data from the html.
+        Traverse through HTML tree recursively to preserve newline and skip
+        unwanted (code/binary) elements
         """
         from bs4 import NavigableString
-        from bs4.element import Comment
+        from bs4.element import Comment, Tag

+        tag_name = getattr(element, "name", None)
+        if isinstance(element, Comment) or tag_name in elements_to_skip:
+            return ""
+        elif isinstance(element, NavigableString):
+            return element
+        elif tag_name == "br":
+            return "\n"
+        elif tag_name in newline_elements:
+            return (
+                "".join(
+                    self._process_element(child, elements_to_skip, newline_elements)
+                    for child in element.children
+                    if isinstance(child, (Tag, NavigableString, Comment))
+                )
+                + "\n"
+            )
+        else:
+            return "".join(
+                self._process_element(child, elements_to_skip, newline_elements)
+                for child in element.children
+                if isinstance(child, (Tag, NavigableString, Comment))
+            )
+
+    def _get_clean_text(self, element: "Tag") -> str:
+        """
+        Returns cleaned text with newlines preserved and irrelevant elements removed
+        """
         elements_to_skip = [
             "script",
             "noscript",
@@ -146,30 +184,13 @@ class ReadTheDocsLoader(BaseLoader):
"tr",
]
def process_element(el):
"""
Traverse through HTML tree recursively to preserve newline and skip
unwanted (code/binary) elements
"""
tag_name = getattr(el, "name", None)
if isinstance(el, Comment) or tag_name in elements_to_skip:
return ""
elif isinstance(el, NavigableString):
return el
elif tag_name == "br":
return "\n"
elif tag_name in newline_elements:
return "".join(process_element(child) for child in el.children) + "\n"
else:
return "".join(process_element(child) for child in el.children)
text = process_element(element)
text = self._process_element(element, elements_to_skip, newline_elements)
return text.strip()
def _clean_data(self, data: str) -> str:
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, **self.bs_kwargs)
soup = BeautifulSoup(data, "html.parser", **self.bs_kwargs)
# default tags
html_tags = [
@@ -180,19 +201,21 @@ class ReadTheDocsLoader(BaseLoader):
         if self.custom_html_tag is not None:
             html_tags.append(self.custom_html_tag)

-        text = None
+        element = None

         # reversed order. check the custom one first
         for tag, attrs in html_tags[::-1]:
-            text = soup.find(tag, attrs)
+            element = soup.find(tag, attrs)
             # if found, break
-            if text is not None:
+            if element is not None:
                 break

-        if text is not None and not (
-            self.exclude_index_pages and self._get_link_ratio(text) >= 0.5
+        if (
+            element is not None
+            and min(self._get_link_ratio(element), 1.0) <= self.exclude_links_ratio
         ):
-            text = self._get_clean_text(text)
+            print("=" * 100, "\n", element, type(element), "\n", "=" * 100)
+            text = self._get_clean_text(element)
         else:
             text = ""

         # trim empty lines
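
For reference, a standalone sketch of the links:content heuristic and the new keep/drop condition above (bs4 only; the HTML snippet, tag lookup, and 0.5 threshold are illustrative, not the loader's exact code):

from bs4 import BeautifulSoup

html = (
    '<div role="main">'
    "<p>Index</p>"
    '<a href="/a">Page A</a> <a href="/b">Page B</a> <a href="/c">Page C</a>'
    "</div>"
)
section = BeautifulSoup(html, "html.parser").find("div", {"role": "main"})

# Ratio of text inside <a> tags to all visible text in the section.
links = section.find_all("a")
total_text = "".join(str(s) for s in section.stripped_strings)
link_text = "".join(t for a in links for t in a.stripped_strings)
ratio = len(link_text) / len(total_text) if total_text else 0.0

# Mirrors the condition in _clean_data: keep the page only when
# min(ratio, 1.0) <= exclude_links_ratio. A link-heavy index page like this
# one (ratio ~0.78) is dropped at a threshold of 0.5.
exclude_links_ratio = 0.5
keep = min(ratio, 1.0) <= exclude_links_ratio
print(f"link ratio {ratio:.2f}, keep page: {keep}")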

@@ -35,14 +35,12 @@ def test_custom() -> None:
 def test_nested_html_structure() -> None:
     loader = ReadTheDocsLoader(PARENT_DIR / "nested_html_structure")
     documents = loader.load()
-    assert "\n" not in documents[0].page_content
+    assert documents[0].page_content == "Hello World!"

 @pytest.mark.requires("bs4")
 def test_index_page() -> None:
-    loader = ReadTheDocsLoader(
-        PARENT_DIR / "index_page", exclude_index_pages=True
-    )
+    loader = ReadTheDocsLoader(PARENT_DIR / "index_page", exclude_links_ratio=0.5)
     documents = loader.load()
     assert len(documents[0].page_content) == 0
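
A possible companion check, not part of this commit (it assumes the same index_page fixture): with the new default exclude_links_ratio of 1.0, min(ratio, 1.0) <= 1.0 always holds, so no page is filtered out.

@pytest.mark.requires("bs4")
def test_index_page_kept_by_default() -> None:
    # Hypothetical test: the default threshold (1.0) never excludes a page, so
    # the link-heavy index fixture should still yield non-empty content.
    loader = ReadTheDocsLoader(PARENT_DIR / "index_page")
    documents = loader.load()
    assert len(documents[0].page_content) > 0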