mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-07 12:06:43 +00:00
community: ConfluenceLoader: add a filter method for attachments (#29882)
Adds an `attachment_filter_func` parameter to the ConfluenceLoader class which can be used to determine which attachments are indexed. This is useful if you are interested in excluding attachments based on their media type or other metadata.
This commit is contained in:
parent
9ed47a4d63
commit
a2d05a376c
@ -120,6 +120,9 @@ class ConfluenceLoader(BaseLoader):
|
||||
:type include_archived_content: bool, optional
|
||||
:param include_attachments: defaults to False
|
||||
:type include_attachments: bool, optional
|
||||
:param attachment_filter_func: A function that takes the attachment information
|
||||
from Confluence and decides whether or not the
|
||||
attachment is processed.
|
||||
:param include_comments: defaults to False
|
||||
:type include_comments: bool, optional
|
||||
:param content_format: Specify content format, defaults to
|
||||
@ -176,6 +179,7 @@ class ConfluenceLoader(BaseLoader):
|
||||
ocr_languages: Optional[str] = None,
|
||||
keep_markdown_format: bool = False,
|
||||
keep_newlines: bool = False,
|
||||
attachment_filter_func: Optional[Callable[[dict], bool]] = None,
|
||||
):
|
||||
self.space_key = space_key
|
||||
self.page_ids = page_ids
|
||||
@ -192,6 +196,7 @@ class ConfluenceLoader(BaseLoader):
|
||||
self.ocr_languages = ocr_languages
|
||||
self.keep_markdown_format = keep_markdown_format
|
||||
self.keep_newlines = keep_newlines
|
||||
self.attachment_filter_func = attachment_filter_func
|
||||
|
||||
confluence_kwargs = confluence_kwargs or {}
|
||||
errors = ConfluenceLoader.validate_init_args(
|
||||
@ -660,6 +665,11 @@ class ConfluenceLoader(BaseLoader):
|
||||
attachments = self.confluence.get_attachments_from_content(page_id)["results"]
|
||||
texts = []
|
||||
for attachment in attachments:
|
||||
if self.attachment_filter_func and not self.attachment_filter_func(
|
||||
attachment
|
||||
):
|
||||
continue
|
||||
|
||||
media_type = attachment["metadata"]["mediaType"]
|
||||
absolute_url = self.base_url + attachment["_links"]["download"]
|
||||
title = attachment["title"]
|
||||
|
Loading…
Reference in New Issue
Block a user