From 2c334d4d4f0cd1046b901e54e5228eee7bb1d5fc Mon Sep 17 00:00:00 2001 From: Andrew Zhou Date: Fri, 27 Oct 2023 16:42:20 -0400 Subject: [PATCH] Typing, change var, etc --- .../langchain/document_loaders/readthedocs.py | 95 ++++++++++++------- .../document_loaders/test_readthedoc.py | 6 +- 2 files changed, 61 insertions(+), 40 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/readthedocs.py b/libs/langchain/langchain/document_loaders/readthedocs.py index 2d99f8ba31d..ef595de1277 100644 --- a/libs/langchain/langchain/document_loaders/readthedocs.py +++ b/libs/langchain/langchain/document_loaders/readthedocs.py @@ -1,9 +1,13 @@ from pathlib import Path -from typing import Any, List, Optional, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader +if TYPE_CHECKING: + from bs4 import NavigableString + from bs4.element import Comment, Tag + class ReadTheDocsLoader(BaseLoader): """Load `ReadTheDocs` documentation directory.""" @@ -15,7 +19,7 @@ class ReadTheDocsLoader(BaseLoader): errors: Optional[str] = None, custom_html_tag: Optional[Tuple[str, dict]] = None, patterns: Sequence[str] = ("*.htm", "*.html"), - exclude_index_pages: bool = False, + exclude_links_ratio: Optional[float] = 1.0, **kwargs: Optional[Any] ): """ @@ -37,9 +41,9 @@ class ReadTheDocsLoader(BaseLoader): custom_html_tag: Optional custom html tag to retrieve the content from files. patterns: The file patterns to load, passed to `glob.rglob`. - exclude_index_pages: Exclude index pages with high link ratios (>50%). + exclude_links_ratio: The ratio of links:content to exclude pages from. This is to reduce the frequency at which index pages make their - way into retrieved results. + way into retrieved results. Recommended: 0.5 kwargs: named arguments passed to `bs4.BeautifulSoup`. """ try: @@ -52,7 +56,9 @@ class ReadTheDocsLoader(BaseLoader): try: _ = BeautifulSoup( - "Parser builder library test.", **kwargs + "Parser builder library test.", + "html.parser", + **kwargs ) except Exception as e: raise ValueError("Parsing kwargs do not appear valid") from e @@ -63,7 +69,7 @@ class ReadTheDocsLoader(BaseLoader): self.custom_html_tag = custom_html_tag self.patterns = patterns self.bs_kwargs = kwargs - self.exclude_index_pages = exclude_index_pages + self.exclude_links_ratio = exclude_links_ratio def load(self) -> List[Document]: """Load documents.""" @@ -78,7 +84,7 @@ class ReadTheDocsLoader(BaseLoader): docs.append(Document(page_content=text, metadata=metadata)) return docs - def _get_link_ratio(self, section): + def _get_link_ratio(self, section: "Tag"): links = section.find_all("a") total_text = "".join(str(s) for s in section.stripped_strings) if len(total_text) == 0: @@ -92,14 +98,46 @@ class ReadTheDocsLoader(BaseLoader): ) return len(link_text) / len(total_text) - def _get_clean_text(self, element): + def _process_element( + self, + element: Union["Tag", "NavigableString", "Comment"], + elements_to_skip: List[str], + newline_elements: List[str], + ): """ - Recursive text getter that excludes code and binary content and - preserves newline data from the html. + Traverse through HTML tree recursively to preserve newline and skip + unwanted (code/binary) elements """ from bs4 import NavigableString - from bs4.element import Comment + from bs4.element import Comment, Tag + tag_name = getattr(element, "name", None) + if isinstance(element, Comment) or tag_name in elements_to_skip: + return "" + elif isinstance(element, NavigableString): + return element + elif tag_name == "br": + return "\n" + elif tag_name in newline_elements: + return ( + "".join( + self._process_element(child, elements_to_skip, newline_elements) + for child in element.children + if isinstance(child, (Tag, NavigableString, Comment)) + ) + + "\n" + ) + else: + return "".join( + self._process_element(child, elements_to_skip, newline_elements) + for child in element.children + if isinstance(child, (Tag, NavigableString, Comment)) + ) + + def _get_clean_text(self, element: "Tag") -> str: + """ + Returns cleaned text with newlines preserved and irrelevant elements removed + """ elements_to_skip = [ "script", "noscript", @@ -146,30 +184,13 @@ class ReadTheDocsLoader(BaseLoader): "tr", ] - def process_element(el): - """ - Traverse through HTML tree recursively to preserve newline and skip - unwanted (code/binary) elements - """ - tag_name = getattr(el, "name", None) - if isinstance(el, Comment) or tag_name in elements_to_skip: - return "" - elif isinstance(el, NavigableString): - return el - elif tag_name == "br": - return "\n" - elif tag_name in newline_elements: - return "".join(process_element(child) for child in el.children) + "\n" - else: - return "".join(process_element(child) for child in el.children) - - text = process_element(element) + text = self._process_element(element, elements_to_skip, newline_elements) return text.strip() def _clean_data(self, data: str) -> str: from bs4 import BeautifulSoup - soup = BeautifulSoup(data, **self.bs_kwargs) + soup = BeautifulSoup(data, "html.parser", **self.bs_kwargs) # default tags html_tags = [ @@ -180,19 +201,21 @@ class ReadTheDocsLoader(BaseLoader): if self.custom_html_tag is not None: html_tags.append(self.custom_html_tag) - text = None + element = None # reversed order. check the custom one first for tag, attrs in html_tags[::-1]: - text = soup.find(tag, attrs) + element = soup.find(tag, attrs) # if found, break - if text is not None: + if element is not None: break - if text is not None and not ( - self.exclude_index_pages and self._get_link_ratio(text) >= 0.5 + if ( + element is not None + and min(self._get_link_ratio(element), 1.0) <= self.exclude_links_ratio ): - text = self._get_clean_text(text) + print("=" * 100, "\n", element, type(element), "\n", "=" * 100) + text = self._get_clean_text(element) else: text = "" # trim empty lines diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py b/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py index c39c1f99635..087bbf3480b 100644 --- a/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py +++ b/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py @@ -35,14 +35,12 @@ def test_custom() -> None: def test_nested_html_structure() -> None: loader = ReadTheDocsLoader(PARENT_DIR / "nested_html_structure") documents = loader.load() - assert "\n" not in documents[0].page_content + assert documents[0].page_content == "Hello World!" @pytest.mark.requires("bs4") def test_index_page() -> None: - loader = ReadTheDocsLoader( - PARENT_DIR / "index_page", exclude_index_pages=True - ) + loader = ReadTheDocsLoader(PARENT_DIR / "index_page", exclude_links_ratio=0.5) documents = loader.load() assert len(documents[0].page_content) == 0