mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 12:48:12 +00:00
fix: ReadTheDocs loader main content filter (#2609)
It seems the main element wrapper has changed on the ReadTheDocs website, or for some reason it is different for me. This adds an extra filter for the main content wrapper, applied if the first one returns no text.  Co-authored-by: blob42 <spike@w530>
This commit is contained in:
parent
aaac7071a3
commit
54b1645d13
@ -45,6 +45,10 @@ class ReadTheDocsLoader(BaseLoader):
|
|||||||
def _clean_data(data: str) -> str:
|
def _clean_data(data: str) -> str:
|
||||||
soup = BeautifulSoup(data, **self.bs_kwargs)
|
soup = BeautifulSoup(data, **self.bs_kwargs)
|
||||||
text = soup.find_all("main", {"id": "main-content"})
|
text = soup.find_all("main", {"id": "main-content"})
|
||||||
|
|
||||||
|
if len(text) == 0:
|
||||||
|
text = soup.find_all("div", {"role": "main"})
|
||||||
|
|
||||||
if len(text) != 0:
|
if len(text) != 0:
|
||||||
text = text[0].get_text()
|
text = text[0].get_text()
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user