From 05554265b4c3b3c72e240ef541571e0e40d8e5e1 Mon Sep 17 00:00:00 2001
From: Jin Hyung Ahn <zenoengine@gmail.com>
Date: Wed, 15 Jan 2025 23:56:23 +0900
Subject: [PATCH] community: Fix ConfluenceLoader load() failure caused by
 deleted pages (#29232)

## Description
This PR modifies the is_public_page function in ConfluenceLoader to
prevent exceptions caused by deleted pages during the execution of
ConfluenceLoader.process_pages().


**Example scenario:**
Consider the following usage of ConfluenceLoader:
```python
import os
from langchain_community.document_loaders import ConfluenceLoader

loader = ConfluenceLoader(
        url=os.getenv("BASE_URL"),
        token=os.getenv("TOKEN"),
        max_pages=1000,
        cql=f'type=page and lastmodified >= "2020-01-01 00:00"',
        include_restricted_content=False,
)

# Raised Exception : HTTPError: Outdated version/old_draft/trashed? Cannot find content Please provide valid ContentId.
documents = loader.load()
```

If the query result contained a deleted page, is_public_page previously
raised an exception when it called get_all_restrictions_for_content for
that page, which made loader.load() fail for every page in the result.


This change adds a pre-check on the page's status: a page whose status is
not "current" is reported as non-public immediately, so the API call to
get_all_restrictions_for_content is never made for non-current pages.


This fix ensures that such pages are skipped without affecting the rest
of the loading process.


## Issue
N/A (No specific issue number)

## Dependencies
No new dependencies are introduced with this change.

## Twitter handle
[@zenoengine](https://x.com/zenoengine)
---
 .../langchain_community/document_loaders/confluence.py     | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/confluence.py b/libs/community/langchain_community/document_loaders/confluence.py
index 225488c9ceb..d8c5efa8748 100644
--- a/libs/community/langchain_community/document_loaders/confluence.py
+++ b/libs/community/langchain_community/document_loaders/confluence.py
@@ -523,11 +523,14 @@ class ConfluenceLoader(BaseLoader):
 
     def is_public_page(self, page: dict) -> bool:
         """Check if a page is publicly accessible."""
+
+        if page["status"] != "current":
+            return False
+
         restrictions = self.confluence.get_all_restrictions_for_content(page["id"])
 
         return (
-            page["status"] == "current"
-            and not restrictions["read"]["restrictions"]["user"]["results"]
+            not restrictions["read"]["restrictions"]["user"]["results"]
             and not restrictions["read"]["restrictions"]["group"]["results"]
         )