mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-10 15:06:18 +00:00
Update readthedocs.py (#11110)
Only parse .html files; .svg, .png, and favicon.ico files will crash the processing phase. --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
70a793ca9d
commit
51193309ea
@ -1,5 +1,5 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, List, Optional, Tuple, Union
|
from typing import Any, List, Optional, Sequence, Tuple, Union
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
@ -14,6 +14,7 @@ class ReadTheDocsLoader(BaseLoader):
|
|||||||
encoding: Optional[str] = None,
|
encoding: Optional[str] = None,
|
||||||
errors: Optional[str] = None,
|
errors: Optional[str] = None,
|
||||||
custom_html_tag: Optional[Tuple[str, dict]] = None,
|
custom_html_tag: Optional[Tuple[str, dict]] = None,
|
||||||
|
patterns: Sequence[str] = ("*.htm", "*.html"),
|
||||||
**kwargs: Optional[Any]
|
**kwargs: Optional[Any]
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@ -34,6 +35,8 @@ class ReadTheDocsLoader(BaseLoader):
|
|||||||
cannot be used in binary mode.
|
cannot be used in binary mode.
|
||||||
custom_html_tag: Optional custom html tag to retrieve the content from
|
custom_html_tag: Optional custom html tag to retrieve the content from
|
||||||
files.
|
files.
|
||||||
|
patterns: The file patterns to load, passed to `Path.rglob`.
|
||||||
|
kwargs: named arguments passed to `bs4.BeautifulSoup`.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@ -54,18 +57,20 @@ class ReadTheDocsLoader(BaseLoader):
|
|||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
self.errors = errors
|
self.errors = errors
|
||||||
self.custom_html_tag = custom_html_tag
|
self.custom_html_tag = custom_html_tag
|
||||||
|
self.patterns = patterns
|
||||||
self.bs_kwargs = kwargs
|
self.bs_kwargs = kwargs
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load documents."""
|
"""Load documents."""
|
||||||
docs = []
|
docs = []
|
||||||
for p in self.file_path.rglob("*"):
|
for file_pattern in self.patterns:
|
||||||
if p.is_dir():
|
for p in self.file_path.rglob(file_pattern):
|
||||||
continue
|
if p.is_dir():
|
||||||
with open(p, encoding=self.encoding, errors=self.errors) as f:
|
continue
|
||||||
text = self._clean_data(f.read())
|
with open(p, encoding=self.encoding, errors=self.errors) as f:
|
||||||
metadata = {"source": str(p)}
|
text = self._clean_data(f.read())
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
metadata = {"source": str(p)}
|
||||||
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
def _clean_data(self, data: str) -> str:
|
def _clean_data(self, data: str) -> str:
|
||||||
|
Loading…
Reference in New Issue
Block a user