Bagatur/doc loader confluence (#9524)

Co-authored-by: chanjetsdp <chanjetsdp@chanjet.com>
This commit is contained in:
Bagatur 2023-08-21 12:40:44 -07:00 committed by GitHub
parent de1f63505b
commit dda5b1e370
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -68,6 +68,15 @@ class ConfluenceLoader(BaseLoader):
)
documents = loader.load(space_key="SPACE",limit=50)
# Server on prem
loader = ConfluenceLoader(
url="https://confluence.yoursite.com/",
username="me",
api_key="your_password",
cloud=False
)
documents = loader.load(space_key="SPACE",limit=50)
:param url: _description_
:type url: str
:param api_key: _description_, defaults to None
@ -205,6 +214,7 @@ class ConfluenceLoader(BaseLoader):
max_pages: Optional[int] = 1000,
ocr_languages: Optional[str] = None,
keep_markdown_format: bool = False,
keep_newlines: bool = False,
) -> List[Document]:
"""
:param space_key: Space key retrieved from a confluence URL, defaults to None
@ -237,6 +247,9 @@ class ConfluenceLoader(BaseLoader):
:param keep_markdown_format: Whether to keep the markdown format, defaults to
False
:type keep_markdown_format: bool
:param keep_newlines: Whether to keep newlines in the extracted text,
defaults to False
:type keep_newlines: bool
:raises ValueError: _description_
:raises ImportError: _description_
:return: _description_
@ -265,8 +278,9 @@ class ConfluenceLoader(BaseLoader):
include_attachments,
include_comments,
content_format,
ocr_languages,
keep_markdown_format,
ocr_languages=ocr_languages,
keep_markdown_format=keep_markdown_format,
keep_newlines=keep_newlines,
)
if label:
@ -404,6 +418,7 @@ class ConfluenceLoader(BaseLoader):
content_format: ContentFormat,
ocr_languages: Optional[str] = None,
keep_markdown_format: Optional[bool] = False,
keep_newlines: bool = False,
) -> List[Document]:
"""Process a list of pages into a list of documents."""
docs = []
@ -415,8 +430,9 @@ class ConfluenceLoader(BaseLoader):
include_attachments,
include_comments,
content_format,
ocr_languages,
keep_markdown_format,
ocr_languages=ocr_languages,
keep_markdown_format=keep_markdown_format,
keep_newlines=keep_newlines,
)
docs.append(doc)
@ -430,6 +446,7 @@ class ConfluenceLoader(BaseLoader):
content_format: ContentFormat,
ocr_languages: Optional[str] = None,
keep_markdown_format: Optional[bool] = False,
keep_newlines: bool = False,
) -> Document:
if keep_markdown_format:
try:
@ -461,9 +478,14 @@ class ConfluenceLoader(BaseLoader):
else:
content = content_format.get_content(page)
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
attachment_texts
)
if keep_newlines:
text = BeautifulSoup(
content.replace("</p>", "\n</p>").replace("<br />", "\n"), "lxml"
).get_text(" ") + "".join(attachment_texts)
else:
text = BeautifulSoup(content, "lxml").get_text(
" ", strip=True
) + "".join(attachment_texts)
if include_comments:
comments = self.confluence.get_page_comments(