Bagatur/doc loader confluence (#9524)

Co-authored-by: chanjetsdp <chanjetsdp@chanjet.com>
This commit is contained in:
Bagatur 2023-08-21 12:40:44 -07:00 committed by GitHub
parent de1f63505b
commit dda5b1e370
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -68,6 +68,15 @@ class ConfluenceLoader(BaseLoader):
) )
documents = loader.load(space_key="SPACE",limit=50) documents = loader.load(space_key="SPACE",limit=50)
# Server on prem
loader = ConfluenceLoader(
url="https://confluence.yoursite.com/",
username="me",
api_key="your_password",
cloud=False
)
documents = loader.load(space_key="SPACE",limit=50)
:param url: _description_ :param url: _description_
:type url: str :type url: str
:param api_key: _description_, defaults to None :param api_key: _description_, defaults to None
@ -205,6 +214,7 @@ class ConfluenceLoader(BaseLoader):
max_pages: Optional[int] = 1000, max_pages: Optional[int] = 1000,
ocr_languages: Optional[str] = None, ocr_languages: Optional[str] = None,
keep_markdown_format: bool = False, keep_markdown_format: bool = False,
keep_newlines: bool = False,
) -> List[Document]: ) -> List[Document]:
""" """
:param space_key: Space key retrieved from a confluence URL, defaults to None :param space_key: Space key retrieved from a confluence URL, defaults to None
@ -237,6 +247,9 @@ class ConfluenceLoader(BaseLoader):
:param keep_markdown_format: Whether to keep the markdown format, defaults to :param keep_markdown_format: Whether to keep the markdown format, defaults to
False False
:type keep_markdown_format: bool :type keep_markdown_format: bool
:param keep_newlines: Whether to keep newlines in the extracted text, defaults to
False
:type keep_newlines: bool
:raises ValueError: _description_ :raises ValueError: _description_
:raises ImportError: _description_ :raises ImportError: _description_
:return: _description_ :return: _description_
@ -265,8 +278,9 @@ class ConfluenceLoader(BaseLoader):
include_attachments, include_attachments,
include_comments, include_comments,
content_format, content_format,
ocr_languages, ocr_languages=ocr_languages,
keep_markdown_format, keep_markdown_format=keep_markdown_format,
keep_newlines=keep_newlines,
) )
if label: if label:
@ -404,6 +418,7 @@ class ConfluenceLoader(BaseLoader):
content_format: ContentFormat, content_format: ContentFormat,
ocr_languages: Optional[str] = None, ocr_languages: Optional[str] = None,
keep_markdown_format: Optional[bool] = False, keep_markdown_format: Optional[bool] = False,
keep_newlines: bool = False,
) -> List[Document]: ) -> List[Document]:
"""Process a list of pages into a list of documents.""" """Process a list of pages into a list of documents."""
docs = [] docs = []
@ -415,8 +430,9 @@ class ConfluenceLoader(BaseLoader):
include_attachments, include_attachments,
include_comments, include_comments,
content_format, content_format,
ocr_languages, ocr_languages=ocr_languages,
keep_markdown_format, keep_markdown_format=keep_markdown_format,
keep_newlines=keep_newlines,
) )
docs.append(doc) docs.append(doc)
@ -430,6 +446,7 @@ class ConfluenceLoader(BaseLoader):
content_format: ContentFormat, content_format: ContentFormat,
ocr_languages: Optional[str] = None, ocr_languages: Optional[str] = None,
keep_markdown_format: Optional[bool] = False, keep_markdown_format: Optional[bool] = False,
keep_newlines: bool = False,
) -> Document: ) -> Document:
if keep_markdown_format: if keep_markdown_format:
try: try:
@ -461,9 +478,14 @@ class ConfluenceLoader(BaseLoader):
else: else:
content = content_format.get_content(page) content = content_format.get_content(page)
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join( if keep_newlines:
attachment_texts text = BeautifulSoup(
) content.replace("</p>", "\n</p>").replace("<br />", "\n"), "lxml"
).get_text(" ") + "".join(attachment_texts)
else:
text = BeautifulSoup(content, "lxml").get_text(
" ", strip=True
) + "".join(attachment_texts)
if include_comments: if include_comments:
comments = self.confluence.get_page_comments( comments = self.confluence.get_page_comments(