mirror of
https://github.com/hwchase17/langchain.git
synced 2026-01-30 13:50:11 +00:00
Typing, change var, etc
This commit is contained in:
@@ -1,9 +1,13 @@
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Sequence, Tuple, Union
|
||||
from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from bs4 import NavigableString
|
||||
from bs4.element import Comment, Tag
|
||||
|
||||
|
||||
class ReadTheDocsLoader(BaseLoader):
|
||||
"""Load `ReadTheDocs` documentation directory."""
|
||||
@@ -15,7 +19,7 @@ class ReadTheDocsLoader(BaseLoader):
|
||||
errors: Optional[str] = None,
|
||||
custom_html_tag: Optional[Tuple[str, dict]] = None,
|
||||
patterns: Sequence[str] = ("*.htm", "*.html"),
|
||||
exclude_index_pages: bool = False,
|
||||
exclude_links_ratio: Optional[float] = 1.0,
|
||||
**kwargs: Optional[Any]
|
||||
):
|
||||
"""
|
||||
@@ -37,9 +41,9 @@ class ReadTheDocsLoader(BaseLoader):
|
||||
custom_html_tag: Optional custom html tag to retrieve the content from
|
||||
files.
|
||||
patterns: The file patterns to load, passed to `glob.rglob`.
|
||||
exclude_index_pages: Exclude index pages with high link ratios (>50%).
|
||||
exclude_links_ratio: The ratio of links:content to exclude pages from.
|
||||
This is to reduce the frequency at which index pages make their
|
||||
way into retrieved results.
|
||||
way into retrieved results. Recommended: 0.5
|
||||
kwargs: named arguments passed to `bs4.BeautifulSoup`.
|
||||
"""
|
||||
try:
|
||||
@@ -52,7 +56,9 @@ class ReadTheDocsLoader(BaseLoader):
|
||||
|
||||
try:
|
||||
_ = BeautifulSoup(
|
||||
"<html><body>Parser builder library test.</body></html>", **kwargs
|
||||
"<html><body>Parser builder library test.</body></html>",
|
||||
"html.parser",
|
||||
**kwargs
|
||||
)
|
||||
except Exception as e:
|
||||
raise ValueError("Parsing kwargs do not appear valid") from e
|
||||
@@ -63,7 +69,7 @@ class ReadTheDocsLoader(BaseLoader):
|
||||
self.custom_html_tag = custom_html_tag
|
||||
self.patterns = patterns
|
||||
self.bs_kwargs = kwargs
|
||||
self.exclude_index_pages = exclude_index_pages
|
||||
self.exclude_links_ratio = exclude_links_ratio
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
@@ -78,7 +84,7 @@ class ReadTheDocsLoader(BaseLoader):
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
return docs
|
||||
|
||||
def _get_link_ratio(self, section):
|
||||
def _get_link_ratio(self, section: "Tag"):
|
||||
links = section.find_all("a")
|
||||
total_text = "".join(str(s) for s in section.stripped_strings)
|
||||
if len(total_text) == 0:
|
||||
@@ -92,14 +98,46 @@ class ReadTheDocsLoader(BaseLoader):
|
||||
)
|
||||
return len(link_text) / len(total_text)
|
||||
|
||||
def _get_clean_text(self, element):
|
||||
def _process_element(
|
||||
self,
|
||||
element: Union["Tag", "NavigableString", "Comment"],
|
||||
elements_to_skip: List[str],
|
||||
newline_elements: List[str],
|
||||
):
|
||||
"""
|
||||
Recursive text getter that excludes code and binary content and
|
||||
preserves newline data from the html.
|
||||
Traverse through HTML tree recursively to preserve newline and skip
|
||||
unwanted (code/binary) elements
|
||||
"""
|
||||
from bs4 import NavigableString
|
||||
from bs4.element import Comment
|
||||
from bs4.element import Comment, Tag
|
||||
|
||||
tag_name = getattr(element, "name", None)
|
||||
if isinstance(element, Comment) or tag_name in elements_to_skip:
|
||||
return ""
|
||||
elif isinstance(element, NavigableString):
|
||||
return element
|
||||
elif tag_name == "br":
|
||||
return "\n"
|
||||
elif tag_name in newline_elements:
|
||||
return (
|
||||
"".join(
|
||||
self._process_element(child, elements_to_skip, newline_elements)
|
||||
for child in element.children
|
||||
if isinstance(child, (Tag, NavigableString, Comment))
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
else:
|
||||
return "".join(
|
||||
self._process_element(child, elements_to_skip, newline_elements)
|
||||
for child in element.children
|
||||
if isinstance(child, (Tag, NavigableString, Comment))
|
||||
)
|
||||
|
||||
def _get_clean_text(self, element: "Tag") -> str:
|
||||
"""
|
||||
Returns cleaned text with newlines preserved and irrelevant elements removed
|
||||
"""
|
||||
elements_to_skip = [
|
||||
"script",
|
||||
"noscript",
|
||||
@@ -146,30 +184,13 @@ class ReadTheDocsLoader(BaseLoader):
|
||||
"tr",
|
||||
]
|
||||
|
||||
def process_element(el):
|
||||
"""
|
||||
Traverse through HTML tree recursively to preserve newline and skip
|
||||
unwanted (code/binary) elements
|
||||
"""
|
||||
tag_name = getattr(el, "name", None)
|
||||
if isinstance(el, Comment) or tag_name in elements_to_skip:
|
||||
return ""
|
||||
elif isinstance(el, NavigableString):
|
||||
return el
|
||||
elif tag_name == "br":
|
||||
return "\n"
|
||||
elif tag_name in newline_elements:
|
||||
return "".join(process_element(child) for child in el.children) + "\n"
|
||||
else:
|
||||
return "".join(process_element(child) for child in el.children)
|
||||
|
||||
text = process_element(element)
|
||||
text = self._process_element(element, elements_to_skip, newline_elements)
|
||||
return text.strip()
|
||||
|
||||
def _clean_data(self, data: str) -> str:
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
soup = BeautifulSoup(data, **self.bs_kwargs)
|
||||
soup = BeautifulSoup(data, "html.parser", **self.bs_kwargs)
|
||||
|
||||
# default tags
|
||||
html_tags = [
|
||||
@@ -180,19 +201,21 @@ class ReadTheDocsLoader(BaseLoader):
|
||||
if self.custom_html_tag is not None:
|
||||
html_tags.append(self.custom_html_tag)
|
||||
|
||||
text = None
|
||||
element = None
|
||||
|
||||
# reversed order. check the custom one first
|
||||
for tag, attrs in html_tags[::-1]:
|
||||
text = soup.find(tag, attrs)
|
||||
element = soup.find(tag, attrs)
|
||||
# if found, break
|
||||
if text is not None:
|
||||
if element is not None:
|
||||
break
|
||||
|
||||
if text is not None and not (
|
||||
self.exclude_index_pages and self._get_link_ratio(text) >= 0.5
|
||||
if (
|
||||
element is not None
|
||||
and min(self._get_link_ratio(element), 1.0) <= self.exclude_links_ratio
|
||||
):
|
||||
text = self._get_clean_text(text)
|
||||
print("=" * 100, "\n", element, type(element), "\n", "=" * 100)
|
||||
text = self._get_clean_text(element)
|
||||
else:
|
||||
text = ""
|
||||
# trim empty lines
|
||||
|
||||
@@ -35,14 +35,12 @@ def test_custom() -> None:
|
||||
def test_nested_html_structure() -> None:
|
||||
loader = ReadTheDocsLoader(PARENT_DIR / "nested_html_structure")
|
||||
documents = loader.load()
|
||||
assert "\n" not in documents[0].page_content
|
||||
assert documents[0].page_content == "Hello World!"
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
|
||||
def test_index_page() -> None:
|
||||
loader = ReadTheDocsLoader(
|
||||
PARENT_DIR / "index_page", exclude_index_pages=True
|
||||
)
|
||||
loader = ReadTheDocsLoader(PARENT_DIR / "index_page", exclude_links_ratio=0.5)
|
||||
documents = loader.load()
|
||||
assert len(documents[0].page_content) == 0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user