Update readthedocs.py (#11110)

Only parse .html files.
Other files (.svg, .png, favicon.ico) crash the processing phase.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
plpycoin 2023-10-12 23:32:06 +08:00 committed by GitHub
parent 70a793ca9d
commit 51193309ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,5 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import Any, List, Optional, Tuple, Union from typing import Any, List, Optional, Sequence, Tuple, Union
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
@ -14,6 +14,7 @@ class ReadTheDocsLoader(BaseLoader):
encoding: Optional[str] = None, encoding: Optional[str] = None,
errors: Optional[str] = None, errors: Optional[str] = None,
custom_html_tag: Optional[Tuple[str, dict]] = None, custom_html_tag: Optional[Tuple[str, dict]] = None,
patterns: Sequence[str] = ("*.htm", "*.html"),
**kwargs: Optional[Any] **kwargs: Optional[Any]
): ):
""" """
@ -34,6 +35,8 @@ class ReadTheDocsLoader(BaseLoader):
cannot be used in binary mode. cannot be used in binary mode.
custom_html_tag: Optional custom html tag to retrieve the content from custom_html_tag: Optional custom html tag to retrieve the content from
files. files.
patterns: The file patterns to load, passed to `pathlib.Path.rglob`.
kwargs: named arguments passed to `bs4.BeautifulSoup`.
""" """
try: try:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -54,18 +57,20 @@ class ReadTheDocsLoader(BaseLoader):
self.encoding = encoding self.encoding = encoding
self.errors = errors self.errors = errors
self.custom_html_tag = custom_html_tag self.custom_html_tag = custom_html_tag
self.patterns = patterns
self.bs_kwargs = kwargs self.bs_kwargs = kwargs
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
docs = [] docs = []
for p in self.file_path.rglob("*"): for file_pattern in self.patterns:
if p.is_dir(): for p in self.file_path.rglob(file_pattern):
continue if p.is_dir():
with open(p, encoding=self.encoding, errors=self.errors) as f: continue
text = self._clean_data(f.read()) with open(p, encoding=self.encoding, errors=self.errors) as f:
metadata = {"source": str(p)} text = self._clean_data(f.read())
docs.append(Document(page_content=text, metadata=metadata)) metadata = {"source": str(p)}
docs.append(Document(page_content=text, metadata=metadata))
return docs return docs
def _clean_data(self, data: str) -> str: def _clean_data(self, data: str) -> str: