From e85c53ce68e7d10e931c5ec15b1c5a4a040938ca Mon Sep 17 00:00:00 2001 From: Usama Navid Date: Thu, 9 Feb 2023 05:01:07 +0500 Subject: [PATCH] Update readthedocs.py (#943) Sometimes, the docs may be empty. For example, text = soup.find_all("main", {"id": "main-content"}) may be an empty list. To cater to these edge cases, the clean function needs to check whether that list is empty before indexing into it. --- langchain/document_loaders/readthedocs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/langchain/document_loaders/readthedocs.py b/langchain/document_loaders/readthedocs.py index 31d6aa06728..99547f3fb9b 100644 --- a/langchain/document_loaders/readthedocs.py +++ b/langchain/document_loaders/readthedocs.py @@ -19,7 +19,11 @@ class ReadTheDocsLoader(BaseLoader): def _clean_data(data: str) -> str: soup = BeautifulSoup(data) - text = soup.find_all("main", {"id": "main-content"})[0].get_text() + text = soup.find_all("main", {"id": "main-content"}) + if len(text) != 0: + text = text[0].get_text() + else: + text = "" return "\n".join([t for t in text.split("\n") if t]) docs = []