Allow readthedoc loader to pass custom html tag (#5175)

## Description The HTML structure of ReadTheDocs sites can differ. Currently, the HTML tag is hardcoded in the reader, so it cannot handle some layouts. This PR includes the following changes: 1. Replace `find_all` with `find` because we only want one tag. 2. Add a `custom_html_tag` parameter to the loader. 3. Add tests for the ReadTheDocs loader. 4. Refactor the code. ## Issues See more in https://github.com/hwchase17/langchain/pull/2609. The problem was not completely fixed in that PR. --------- Signed-off-by: byhsu <byhsu@linkedin.com> Co-authored-by: byhsu <byhsu@linkedin.com> Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
2025-08-31 10:23:18 +00:00 · 2023-05-24 10:40:27 -07:00
parent d8eed6018f
commit f0730c6489
5 changed files with 111 additions and 21 deletions
--- a/langchain/document_loaders/readthedocs.py
+++ b/langchain/document_loaders/readthedocs.py
@@ -1,6 +1,6 @@
 """Loader that loads ReadTheDocs documentation directory dump."""
 from pathlib import Path
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Tuple, Union
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@@ -11,12 +11,31 @@ class ReadTheDocsLoader(BaseLoader):
    def __init__(
        self,
-        path: str,
+        path: Union[str, Path],
        encoding: Optional[str] = None,
        errors: Optional[str] = None,
        custom_html_tag: Optional[Tuple[str, dict]] = None,
        **kwargs: Optional[Any]
    ):
-        """Initialize path."""
+        """
        Initialize ReadTheDocsLoader.
        The loader loops over all files under `path` and extracts the actual content
        of the files by retrieving the main html tags. The default main html tags are
        `<main id="main-content">` and `<div role="main">`. You can also define your
        own html tag by passing custom_html_tag, e.g.
        `("article", {"role": "main"})`. The loader tries the custom html tag first
        (if given) and then the default html tags, in that order. As soon as one of
        the tags is found, the loop breaks and the content is retrieved from that tag.
        Args:
            path: The location of pulled readthedocs folder.
            encoding: The encoding with which to open the documents.
            errors: Specifies how encoding and decoding errors are to be handled—this
                cannot be used in binary mode.
            custom_html_tag: Optional custom html tag to retrieve the content from
                files.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
@@ -32,34 +51,50 @@ class ReadTheDocsLoader(BaseLoader):
        except Exception as e:
            raise ValueError("Parsing kwargs do not appear valid") from e
-        self.file_path = path
+        self.file_path = Path(path)
        self.encoding = encoding
        self.errors = errors
        self.custom_html_tag = custom_html_tag
        self.bs_kwargs = kwargs
    def load(self) -> List[Document]:
        """Load documents."""
        from bs4 import BeautifulSoup
        def _clean_data(data: str) -> str:
            soup = BeautifulSoup(data, **self.bs_kwargs)
            text = soup.find_all("main", {"id": "main-content"})
            if len(text) == 0:
                text = soup.find_all("div", {"role": "main"})
            if len(text) != 0:
                text = text[0].get_text()
            else:
                text = ""
            return "\n".join([t for t in text.split("\n") if t])
        docs = []
-        for p in Path(self.file_path).rglob("*"):
+        for p in self.file_path.rglob("*"):
            if p.is_dir():
                continue
            with open(p, encoding=self.encoding, errors=self.errors) as f:
-                text = _clean_data(f.read())
+                text = self._clean_data(f.read())
            metadata = {"source": str(p)}
            docs.append(Document(page_content=text, metadata=metadata))
        return docs
    def _clean_data(self, data: str) -> str:
        """Extract the main text content from a raw HTML string.

        The HTML is parsed with BeautifulSoup (using ``self.bs_kwargs``) and
        searched for the first matching tag, in this order:

        1. ``self.custom_html_tag`` (only when it is not ``None``)
        2. ``<main id="main-content">``
        3. ``<div role="main">``

        Args:
            data: Raw HTML contents of a single file.

        Returns:
            The text of the first tag found, with empty lines removed, or an
            empty string when none of the tags is present. Lines that contain
            only whitespace are kept.
        """
        from bs4 import BeautifulSoup

        parsed = BeautifulSoup(data, **self.bs_kwargs)

        # Candidates are listed in lookup order: the user-supplied tag wins
        # over the built-in defaults.
        candidates = []
        if self.custom_html_tag is not None:
            candidates.append(self.custom_html_tag)
        candidates.append(("main", {"id": "main-content"}))
        candidates.append(("div", {"role": "main"}))

        node = None
        for name, attributes in candidates:
            node = parsed.find(name, attributes)
            if node is not None:
                # First hit wins; remaining candidates are not consulted.
                break

        content = "" if node is None else node.get_text()

        # Drop lines that are exactly empty.
        return "\n".join(line for line in content.split("\n") if line)
--- a/tests/unit_tests/document_loaders/test_docs/readthedocs/custom/test.html
+++ b/tests/unit_tests/document_loaders/test_docs/readthedocs/custom/test.html
@@ -0,0 +1,5 @@
 <html>
    <article role="main">
    Hello World!
    </article>
 </html>
--- a/tests/unit_tests/document_loaders/test_docs/readthedocs/div_role_main/test.html
+++ b/tests/unit_tests/document_loaders/test_docs/readthedocs/div_role_main/test.html
@@ -0,0 +1,5 @@
 <html>
    <div role="main">
    Hello World!
    </div>
 </html>
--- a/tests/unit_tests/document_loaders/test_docs/readthedocs/main_id_main_content/test.html
+++ b/tests/unit_tests/document_loaders/test_docs/readthedocs/main_id_main_content/test.html
@@ -0,0 +1,5 @@
 <html>
    <main id="main-content">
    Hello World!
    </main>
 </html>
--- a/tests/unit_tests/document_loaders/test_readthedoc.py
+++ b/tests/unit_tests/document_loaders/test_readthedoc.py
@@ -0,0 +1,40 @@
 from pathlib import Path
 import pytest
 from langchain.document_loaders.readthedocs import ReadTheDocsLoader
 PARENT_DIR = Path(__file__).parent / "test_docs" / "readthedocs"
@pytest.mark.requires("bs4")
def test_main_id_main_content() -> None:
    """The default ``<main id="main-content">`` tag yields non-empty content."""
    first_doc = ReadTheDocsLoader(PARENT_DIR / "main_id_main_content").load()[0]
    assert len(first_doc.page_content) != 0
@pytest.mark.requires("bs4")
def test_div_role_main() -> None:
    """The default ``<div role="main">`` tag yields non-empty content."""
    first_doc = ReadTheDocsLoader(PARENT_DIR / "div_role_main").load()[0]
    assert len(first_doc.page_content) != 0
@pytest.mark.requires("bs4")
def test_custom() -> None:
    """A user-supplied ``custom_html_tag`` is used to extract content."""
    article_tag = ("article", {"role": "main"})
    loader = ReadTheDocsLoader(PARENT_DIR / "custom", custom_html_tag=article_tag)
    first_doc = loader.load()[0]
    assert len(first_doc.page_content) != 0
@pytest.mark.requires("bs4")
def test_empty() -> None:
    """Without a custom tag, the ``custom`` fixture produces empty content."""
    # The fixture only contains <article role="main">, which is not one of
    # the default tags, so nothing should be extracted.
    first_doc = ReadTheDocsLoader(PARENT_DIR / "custom").load()[0]
    assert len(first_doc.page_content) == 0