Add typing, rename exclude_index_pages to exclude_links_ratio, etc.

Andrew Zhou
2023-10-27 16:42:20 -04:00
parent f19dc245a7
commit 2c334d4d4f
2 changed files with 61 additions and 40 deletions

@@ -1,9 +1,13 @@
 from pathlib import Path
-from typing import Any, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader

+if TYPE_CHECKING:
+    from bs4 import NavigableString
+    from bs4.element import Comment, Tag

 class ReadTheDocsLoader(BaseLoader):
     """Load `ReadTheDocs` documentation directory."""
@@ -15,7 +19,7 @@ class ReadTheDocsLoader(BaseLoader):
         errors: Optional[str] = None,
         custom_html_tag: Optional[Tuple[str, dict]] = None,
         patterns: Sequence[str] = ("*.htm", "*.html"),
-        exclude_index_pages: bool = False,
+        exclude_links_ratio: Optional[float] = 1.0,
         **kwargs: Optional[Any]
     ):
         """
@@ -37,9 +41,9 @@ class ReadTheDocsLoader(BaseLoader):
             custom_html_tag: Optional custom html tag to retrieve the content from
                 files.
             patterns: The file patterns to load, passed to `glob.rglob`.
-            exclude_index_pages: Exclude index pages with high link ratios (>50%).
+            exclude_links_ratio: The ratio of links:content to exclude pages from.
                 This is to reduce the frequency at which index pages make their
-                way into retrieved results.
+                way into retrieved results. Recommended: 0.5
             kwargs: named arguments passed to `bs4.BeautifulSoup`.
         """
         try:
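
For reference, a minimal usage sketch of the renamed parameter (the "rtdocs/" path is illustrative and assumes a locally scraped ReadTheDocs directory):

from langchain.document_loaders import ReadTheDocsLoader

# Pages whose links:content ratio exceeds 0.5 are skipped; the new default of
# 1.0 keeps every page.
loader = ReadTheDocsLoader("rtdocs/", exclude_links_ratio=0.5)
docs = loader.load()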
@@ -52,7 +56,9 @@ class ReadTheDocsLoader(BaseLoader):
         try:
             _ = BeautifulSoup(
-                "<html><body>Parser builder library test.</body></html>", **kwargs
+                "<html><body>Parser builder library test.</body></html>",
+                "html.parser",
+                **kwargs
             )
         except Exception as e:
             raise ValueError("Parsing kwargs do not appear valid") from e
@@ -63,7 +69,7 @@ class ReadTheDocsLoader(BaseLoader):
         self.custom_html_tag = custom_html_tag
         self.patterns = patterns
         self.bs_kwargs = kwargs
-        self.exclude_index_pages = exclude_index_pages
+        self.exclude_links_ratio = exclude_links_ratio

     def load(self) -> List[Document]:
         """Load documents."""
@@ -78,7 +84,7 @@ class ReadTheDocsLoader(BaseLoader):
             docs.append(Document(page_content=text, metadata=metadata))
         return docs

-    def _get_link_ratio(self, section):
+    def _get_link_ratio(self, section: "Tag"):
         links = section.find_all("a")
         total_text = "".join(str(s) for s in section.stripped_strings)
         if len(total_text) == 0:
@@ -92,14 +98,46 @@ class ReadTheDocsLoader(BaseLoader):
         )
         return len(link_text) / len(total_text)

-    def _get_clean_text(self, element):
+    def _process_element(
+        self,
+        element: Union["Tag", "NavigableString", "Comment"],
+        elements_to_skip: List[str],
+        newline_elements: List[str],
+    ):
         """
-        Recursive text getter that excludes code and binary content and
-        preserves newline data from the html.
+        Traverse through HTML tree recursively to preserve newline and skip
+        unwanted (code/binary) elements
         """
         from bs4 import NavigableString
-        from bs4.element import Comment
+        from bs4.element import Comment, Tag

+        tag_name = getattr(element, "name", None)
+        if isinstance(element, Comment) or tag_name in elements_to_skip:
+            return ""
+        elif isinstance(element, NavigableString):
+            return element
+        elif tag_name == "br":
+            return "\n"
+        elif tag_name in newline_elements:
+            return (
+                "".join(
+                    self._process_element(child, elements_to_skip, newline_elements)
+                    for child in element.children
+                    if isinstance(child, (Tag, NavigableString, Comment))
+                )
+                + "\n"
+            )
+        else:
+            return "".join(
+                self._process_element(child, elements_to_skip, newline_elements)
+                for child in element.children
+                if isinstance(child, (Tag, NavigableString, Comment))
+            )
+
+    def _get_clean_text(self, element: "Tag") -> str:
+        """
+        Returns cleaned text with newlines preserved and irrelevant elements removed
+        """
         elements_to_skip = [
             "script",
             "noscript",
@@ -146,30 +184,13 @@ class ReadTheDocsLoader(BaseLoader):
"tr",
]
def process_element(el):
"""
Traverse through HTML tree recursively to preserve newline and skip
unwanted (code/binary) elements
"""
tag_name = getattr(el, "name", None)
if isinstance(el, Comment) or tag_name in elements_to_skip:
return ""
elif isinstance(el, NavigableString):
return el
elif tag_name == "br":
return "\n"
elif tag_name in newline_elements:
return "".join(process_element(child) for child in el.children) + "\n"
else:
return "".join(process_element(child) for child in el.children)
text = process_element(element)
text = self._process_element(element, elements_to_skip, newline_elements)
return text.strip()
def _clean_data(self, data: str) -> str:
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, **self.bs_kwargs)
soup = BeautifulSoup(data, "html.parser", **self.bs_kwargs)
# default tags
html_tags = [
@@ -180,19 +201,21 @@ class ReadTheDocsLoader(BaseLoader):
         if self.custom_html_tag is not None:
             html_tags.append(self.custom_html_tag)

-        text = None
+        element = None

         # reversed order. check the custom one first
         for tag, attrs in html_tags[::-1]:
-            text = soup.find(tag, attrs)
+            element = soup.find(tag, attrs)
             # if found, break
-            if text is not None:
+            if element is not None:
                 break

-        if text is not None and not (
-            self.exclude_index_pages and self._get_link_ratio(text) >= 0.5
+        if (
+            element is not None
+            and min(self._get_link_ratio(element), 1.0) <= self.exclude_links_ratio
         ):
-            text = self._get_clean_text(text)
+            print("=" * 100, "\n", element, type(element), "\n", "=" * 100)
+            text = self._get_clean_text(element)
         else:
             text = ""

         # trim empty lines
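
For reference, a standalone sketch of the links:content heuristic and the new keep/drop condition above (bs4 only; the HTML snippet, tag lookup, and 0.5 threshold are illustrative, not the loader's exact code):

from bs4 import BeautifulSoup

html = (
    '<div role="main">'
    "<p>Index</p>"
    '<a href="/a">Page A</a> <a href="/b">Page B</a> <a href="/c">Page C</a>'
    "</div>"
)
section = BeautifulSoup(html, "html.parser").find("div", {"role": "main"})

# Ratio of text inside <a> tags to all visible text in the section.
links = section.find_all("a")
total_text = "".join(str(s) for s in section.stripped_strings)
link_text = "".join(t for a in links for t in a.stripped_strings)
ratio = len(link_text) / len(total_text) if total_text else 0.0

# Mirrors the condition in _clean_data: keep the page only when
# min(ratio, 1.0) <= exclude_links_ratio. A link-heavy index page like this
# one (ratio ~0.78) is dropped at a threshold of 0.5.
exclude_links_ratio = 0.5
keep = min(ratio, 1.0) <= exclude_links_ratio
print(f"link ratio {ratio:.2f}, keep page: {keep}")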

@@ -35,14 +35,12 @@ def test_custom() -> None:
 def test_nested_html_structure() -> None:
     loader = ReadTheDocsLoader(PARENT_DIR / "nested_html_structure")
     documents = loader.load()
-    assert "\n" not in documents[0].page_content
+    assert documents[0].page_content == "Hello World!"

 @pytest.mark.requires("bs4")
 def test_index_page() -> None:
-    loader = ReadTheDocsLoader(
-        PARENT_DIR / "index_page", exclude_index_pages=True
-    )
+    loader = ReadTheDocsLoader(PARENT_DIR / "index_page", exclude_links_ratio=0.5)
     documents = loader.load()
     assert len(documents[0].page_content) == 0
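
A possible companion check, not part of this commit (it assumes the same index_page fixture): with the new default exclude_links_ratio of 1.0, min(ratio, 1.0) <= 1.0 always holds, so no page is filtered out.

@pytest.mark.requires("bs4")
def test_index_page_kept_by_default() -> None:
    # Hypothetical test: the default threshold (1.0) never excludes a page, so
    # the link-heavy index fixture should still yield non-empty content.
    loader = ReadTheDocsLoader(PARENT_DIR / "index_page")
    documents = loader.load()
    assert len(documents[0].page_content) > 0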