From 05554265b4c3b3c72e240ef541571e0e40d8e5e1 Mon Sep 17 00:00:00 2001
From: Jin Hyung Ahn
Date: Wed, 15 Jan 2025 23:56:23 +0900
Subject: [PATCH] community: Fix ConfluenceLoader load() failure caused by deleted pages (#29232)

## Description

This PR modifies the is_public_page function in ConfluenceLoader to prevent exceptions caused by deleted pages during the execution of ConfluenceLoader.process_pages().

**Example scenario:**

Consider the following usage of ConfluenceLoader:

```python
import os
from langchain_community.document_loaders import ConfluenceLoader

loader = ConfluenceLoader(
    url=os.getenv("BASE_URL"),
    token=os.getenv("TOKEN"),
    max_pages=1000,
    cql=f'type=page and lastmodified >= "2020-01-01 00:00"',
    include_restricted_content=False,
)

# Raised exception: HTTPError: Outdated version/old_draft/trashed? Cannot find content Please provide valid ContentId.
documents = loader.load()
```

If a deleted page exists within the query result, the is_public_page function would previously raise an exception when calling get_all_restrictions_for_content, causing the loader.load() process to fail for all pages.

By adding a pre-check for the page's "current" status, unnecessary API calls to get_all_restrictions_for_content for non-current pages are avoided.

This fix ensures that such pages are skipped without affecting the rest of the loading process.

## Issue

N/A (No specific issue number)

## Dependencies

No new dependencies are introduced with this change.

## Twitter handle [@zenoengine](https://x.com/zenoengine) --- .../langchain_community/document_loaders/confluence.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/confluence.py b/libs/community/langchain_community/document_loaders/confluence.py index 225488c9ceb..d8c5efa8748 100644 --- a/libs/community/langchain_community/document_loaders/confluence.py +++ b/libs/community/langchain_community/document_loaders/confluence.py @@ -523,11 +523,14 @@ class ConfluenceLoader(BaseLoader): def is_public_page(self, page: dict) -> bool: """Check if a page is publicly accessible.""" + + if page["status"] != "current": + return False + restrictions = self.confluence.get_all_restrictions_for_content(page["id"]) return ( - page["status"] == "current" - and not restrictions["read"]["restrictions"]["user"]["results"] + not restrictions["read"]["restrictions"]["user"]["results"] and not restrictions["read"]["restrictions"]["group"]["results"] )