mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-02 19:34:04 +00:00
Harrison/confluence stuff (#3765)
Co-authored-by: Jelmer Borst <japborst@gmail.com>
This commit is contained in:
parent
5998b53596
commit
15b92d361d
@ -156,6 +156,7 @@ class ConfluenceLoader(BaseLoader):
|
||||
page_ids: Optional[List[str]] = None,
|
||||
label: Optional[str] = None,
|
||||
cql: Optional[str] = None,
|
||||
include_restricted_content: bool = False,
|
||||
include_archived_content: bool = False,
|
||||
include_attachments: bool = False,
|
||||
include_comments: bool = False,
|
||||
@ -171,6 +172,8 @@ class ConfluenceLoader(BaseLoader):
|
||||
:type label: Optional[str], optional
|
||||
:param cql: CQL Expression, defaults to None
|
||||
:type cql: Optional[str], optional
|
||||
:param include_restricted_content: defaults to False
|
||||
:type include_restricted_content: bool, optional
|
||||
:param include_archived_content: Whether to include archived content,
|
||||
defaults to False
|
||||
:type include_archived_content: bool, optional
|
||||
@ -204,9 +207,9 @@ class ConfluenceLoader(BaseLoader):
|
||||
status="any" if include_archived_content else "current",
|
||||
expand="body.storage.value",
|
||||
)
|
||||
for page in pages:
|
||||
doc = self.process_page(page, include_attachments, include_comments)
|
||||
docs.append(doc)
|
||||
docs += self.process_pages(
|
||||
pages, include_restricted_content, include_attachments, include_comments
|
||||
)
|
||||
|
||||
if label:
|
||||
pages = self.paginate_request(
|
||||
@ -216,9 +219,9 @@ class ConfluenceLoader(BaseLoader):
|
||||
max_pages=max_pages,
|
||||
expand="body.storage.value",
|
||||
)
|
||||
for page in pages:
|
||||
doc = self.process_page(page, include_attachments, include_comments)
|
||||
docs.append(doc)
|
||||
docs += self.process_pages(
|
||||
pages, include_restricted_content, include_attachments, include_comments
|
||||
)
|
||||
|
||||
if cql:
|
||||
pages = self.paginate_request(
|
||||
@ -229,9 +232,9 @@ class ConfluenceLoader(BaseLoader):
|
||||
include_archived_spaces=include_archived_content,
|
||||
expand="body.storage.value",
|
||||
)
|
||||
for page in pages:
|
||||
doc = self.process_page(page, include_attachments, include_comments)
|
||||
docs.append(doc)
|
||||
docs += self.process_pages(
|
||||
pages, include_restricted_content, include_attachments, include_comments
|
||||
)
|
||||
|
||||
if page_ids:
|
||||
for page_id in page_ids:
|
||||
@ -248,6 +251,8 @@ class ConfluenceLoader(BaseLoader):
|
||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||
)(self.confluence.get_page_by_id)
|
||||
page = get_page(page_id=page_id, expand="body.storage.value")
|
||||
if not include_restricted_content and not self.is_public_page(page):
|
||||
continue
|
||||
doc = self.process_page(page, include_attachments, include_comments)
|
||||
docs.append(doc)
|
||||
|
||||
@ -295,6 +300,33 @@ class ConfluenceLoader(BaseLoader):
|
||||
docs.extend(batch)
|
||||
return docs[:max_pages]
|
||||
|
||||
def is_public_page(self, page: dict) -> bool:
|
||||
"""Check if a page is publicly accessible."""
|
||||
restrictions = self.confluence.get_all_restrictions_for_content(page["id"])
|
||||
|
||||
return (
|
||||
page["status"] == "current"
|
||||
and not restrictions["read"]["restrictions"]["user"]["results"]
|
||||
and not restrictions["read"]["restrictions"]["group"]["results"]
|
||||
)
|
||||
|
||||
def process_pages(
|
||||
self,
|
||||
pages: List[dict],
|
||||
include_restricted_content: bool,
|
||||
include_attachments: bool,
|
||||
include_comments: bool,
|
||||
) -> List[Document]:
|
||||
"""Process a list of pages into a list of documents."""
|
||||
docs = []
|
||||
for page in pages:
|
||||
if not include_restricted_content and not self.is_public_page(page):
|
||||
continue
|
||||
doc = self.process_page(page, include_attachments, include_comments)
|
||||
docs.append(doc)
|
||||
|
||||
return docs
|
||||
|
||||
def process_page(
|
||||
self,
|
||||
page: dict,
|
||||
|
Loading…
Reference in New Issue
Block a user