mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-14 17:07:25 +00:00
Confluence beautifulsoup (#3576)
Co-authored-by: Theau Heral <theau.heral@ln.email.gs.com>
This commit is contained in:
parent
64501329ab
commit
85dae78548
@ -189,19 +189,8 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
"`label`, `cql` parameters."
|
"`label`, `cql` parameters."
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
|
||||||
import html2text # type: ignore
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError(
|
|
||||||
"`html2text` package not found, please run `pip install html2text`"
|
|
||||||
)
|
|
||||||
|
|
||||||
docs = []
|
docs = []
|
||||||
|
|
||||||
text_maker = html2text.HTML2Text()
|
|
||||||
text_maker.ignore_links = True
|
|
||||||
text_maker.ignore_images = True
|
|
||||||
|
|
||||||
if space_key:
|
if space_key:
|
||||||
pages = self.paginate_request(
|
pages = self.paginate_request(
|
||||||
self.confluence.get_all_pages_from_space,
|
self.confluence.get_all_pages_from_space,
|
||||||
@ -211,9 +200,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
expand="body.storage.value",
|
expand="body.storage.value",
|
||||||
)
|
)
|
||||||
for page in pages:
|
for page in pages:
|
||||||
doc = self.process_page(
|
doc = self.process_page(page, include_attachments, include_comments)
|
||||||
page, include_attachments, include_comments, text_maker
|
|
||||||
)
|
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
|
|
||||||
if label:
|
if label:
|
||||||
@ -225,9 +212,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
expand="body.storage.value",
|
expand="body.storage.value",
|
||||||
)
|
)
|
||||||
for page in pages:
|
for page in pages:
|
||||||
doc = self.process_page(
|
doc = self.process_page(page, include_attachments, include_comments)
|
||||||
page, include_attachments, include_comments, text_maker
|
|
||||||
)
|
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
|
|
||||||
if cql:
|
if cql:
|
||||||
@ -239,9 +224,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
expand="body.storage.value",
|
expand="body.storage.value",
|
||||||
)
|
)
|
||||||
for page in pages:
|
for page in pages:
|
||||||
doc = self.process_page(
|
doc = self.process_page(page, include_attachments, include_comments)
|
||||||
page, include_attachments, include_comments, text_maker
|
|
||||||
)
|
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
|
|
||||||
if page_ids:
|
if page_ids:
|
||||||
@ -259,9 +242,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||||
)(self.confluence.get_page_by_id)
|
)(self.confluence.get_page_by_id)
|
||||||
page = get_page(page_id=page_id, expand="body.storage.value")
|
page = get_page(page_id=page_id, expand="body.storage.value")
|
||||||
doc = self.process_page(
|
doc = self.process_page(page, include_attachments, include_comments)
|
||||||
page, include_attachments, include_comments, text_maker
|
|
||||||
)
|
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
|
|
||||||
return docs
|
return docs
|
||||||
@ -313,21 +294,28 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
page: dict,
|
page: dict,
|
||||||
include_attachments: bool,
|
include_attachments: bool,
|
||||||
include_comments: bool,
|
include_comments: bool,
|
||||||
text_maker: Any,
|
|
||||||
) -> Document:
|
) -> Document:
|
||||||
|
try:
|
||||||
|
from bs4 import BeautifulSoup # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"`beautifulsoup4` package not found, please run"
|
||||||
|
" `pip install beautifulsoup4`"
|
||||||
|
)
|
||||||
|
|
||||||
if include_attachments:
|
if include_attachments:
|
||||||
attachment_texts = self.process_attachment(page["id"])
|
attachment_texts = self.process_attachment(page["id"])
|
||||||
else:
|
else:
|
||||||
attachment_texts = []
|
attachment_texts = []
|
||||||
text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
|
text = BeautifulSoup(
|
||||||
attachment_texts
|
page["body"]["storage"]["value"], "lxml"
|
||||||
)
|
).get_text() + "".join(attachment_texts)
|
||||||
if include_comments:
|
if include_comments:
|
||||||
comments = self.confluence.get_page_comments(
|
comments = self.confluence.get_page_comments(
|
||||||
page["id"], expand="body.view.value", depth="all"
|
page["id"], expand="body.view.value", depth="all"
|
||||||
)["results"]
|
)["results"]
|
||||||
comment_texts = [
|
comment_texts = [
|
||||||
text_maker.handle(comment["body"]["view"]["value"])
|
BeautifulSoup(comment["body"]["view"]["value"], "lxml").get_text()
|
||||||
for comment in comments
|
for comment in comments
|
||||||
]
|
]
|
||||||
text = text + "".join(comment_texts)
|
text = text + "".join(comment_texts)
|
||||||
|
Loading…
Reference in New Issue
Block a user