community: ConfluenceLoader: add a filter method for attachments (#29882)

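Adds an `attachment_filter_func` parameter to the ConfluenceLoader class
which can be used to determine which attachments are indexed. The function
receives the attachment information dict returned by Confluence and returns
`True` to process the attachment; attachments for which it returns a falsy
value are skipped. This is useful if you are interested in excluding
attachments based on their media type or other metadata.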
This commit is contained in:
Fabian Blatz 2025-02-20 00:20:45 +01:00 committed by GitHub
parent 9ed47a4d63
commit a2d05a376c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -120,6 +120,9 @@ class ConfluenceLoader(BaseLoader):
:type include_archived_content: bool, optional
:param include_attachments: defaults to False
:type include_attachments: bool, optional
:param attachment_filter_func: A function that takes the attachment information
from Confluence and decides whether or not the
attachment is processed.
:param include_comments: defaults to False
:type include_comments: bool, optional
:param content_format: Specify content format, defaults to
@ -176,6 +179,7 @@ class ConfluenceLoader(BaseLoader):
ocr_languages: Optional[str] = None,
keep_markdown_format: bool = False,
keep_newlines: bool = False,
attachment_filter_func: Optional[Callable[[dict], bool]] = None,
):
self.space_key = space_key
self.page_ids = page_ids
@ -192,6 +196,7 @@ class ConfluenceLoader(BaseLoader):
self.ocr_languages = ocr_languages
self.keep_markdown_format = keep_markdown_format
self.keep_newlines = keep_newlines
self.attachment_filter_func = attachment_filter_func
confluence_kwargs = confluence_kwargs or {}
errors = ConfluenceLoader.validate_init_args(
@ -660,6 +665,11 @@ class ConfluenceLoader(BaseLoader):
attachments = self.confluence.get_attachments_from_content(page_id)["results"]
texts = []
for attachment in attachments:
if self.attachment_filter_func and not self.attachment_filter_func(
attachment
):
continue
media_type = attachment["metadata"]["mediaType"]
absolute_url = self.base_url + attachment["_links"]["download"]
title = attachment["title"]