mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-13 06:40:04 +00:00
community: add include_labels option to ConfluenceLoader (#28259)
## **Description:** Enable `ConfluenceLoader` to include page labels via a new `include_labels` option (`False` by default for backward compatibility). When enabled, the labels are added to the `metadata` of each `Document`, e.g. `{"labels": ["l1", "l2"]}`. ## Notes The Confluence API returns labels when `metadata.labels` is passed in the `expand` query parameter. All of the following functions support `expand` in the same way: - confluence.get_page_by_id - confluence.get_all_pages_by_label - confluence.get_all_pages_from_space - cql (internally using [/api/content/search](https://developer.atlassian.com/cloud/confluence/rest/v1/api-group-content/#api-wiki-rest-api-content-search-get)) ## **Issue:** No issue related to this PR. ## **Dependencies:** No changes. ## **Twitter handle:** [@gymnstcs](https://x.com/gymnstcs) - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
242fee11be
commit
ce3b69aa05
@ -166,6 +166,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_archived_content: bool = False,
|
include_archived_content: bool = False,
|
||||||
include_attachments: bool = False,
|
include_attachments: bool = False,
|
||||||
include_comments: bool = False,
|
include_comments: bool = False,
|
||||||
|
include_labels: bool = False,
|
||||||
content_format: ContentFormat = ContentFormat.STORAGE,
|
content_format: ContentFormat = ContentFormat.STORAGE,
|
||||||
limit: Optional[int] = 50,
|
limit: Optional[int] = 50,
|
||||||
max_pages: Optional[int] = 1000,
|
max_pages: Optional[int] = 1000,
|
||||||
@ -181,6 +182,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
self.include_archived_content = include_archived_content
|
self.include_archived_content = include_archived_content
|
||||||
self.include_attachments = include_attachments
|
self.include_attachments = include_attachments
|
||||||
self.include_comments = include_comments
|
self.include_comments = include_comments
|
||||||
|
self.include_labels = include_labels
|
||||||
self.content_format = content_format
|
self.content_format = content_format
|
||||||
self.limit = limit
|
self.limit = limit
|
||||||
self.max_pages = max_pages
|
self.max_pages = max_pages
|
||||||
@ -327,12 +329,20 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
)
|
)
|
||||||
include_attachments = self._resolve_param("include_attachments", kwargs)
|
include_attachments = self._resolve_param("include_attachments", kwargs)
|
||||||
include_comments = self._resolve_param("include_comments", kwargs)
|
include_comments = self._resolve_param("include_comments", kwargs)
|
||||||
|
include_labels = self._resolve_param("include_labels", kwargs)
|
||||||
content_format = self._resolve_param("content_format", kwargs)
|
content_format = self._resolve_param("content_format", kwargs)
|
||||||
limit = self._resolve_param("limit", kwargs)
|
limit = self._resolve_param("limit", kwargs)
|
||||||
max_pages = self._resolve_param("max_pages", kwargs)
|
max_pages = self._resolve_param("max_pages", kwargs)
|
||||||
ocr_languages = self._resolve_param("ocr_languages", kwargs)
|
ocr_languages = self._resolve_param("ocr_languages", kwargs)
|
||||||
keep_markdown_format = self._resolve_param("keep_markdown_format", kwargs)
|
keep_markdown_format = self._resolve_param("keep_markdown_format", kwargs)
|
||||||
keep_newlines = self._resolve_param("keep_newlines", kwargs)
|
keep_newlines = self._resolve_param("keep_newlines", kwargs)
|
||||||
|
expand = ",".join(
|
||||||
|
[
|
||||||
|
content_format.value,
|
||||||
|
"version",
|
||||||
|
*(["metadata.labels"] if include_labels else []),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
if not space_key and not page_ids and not label and not cql:
|
if not space_key and not page_ids and not label and not cql:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -347,13 +357,14 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
limit=limit,
|
limit=limit,
|
||||||
max_pages=max_pages,
|
max_pages=max_pages,
|
||||||
status="any" if include_archived_content else "current",
|
status="any" if include_archived_content else "current",
|
||||||
expand=f"{content_format.value},version",
|
expand=expand,
|
||||||
)
|
)
|
||||||
yield from self.process_pages(
|
yield from self.process_pages(
|
||||||
pages,
|
pages,
|
||||||
include_restricted_content,
|
include_restricted_content,
|
||||||
include_attachments,
|
include_attachments,
|
||||||
include_comments,
|
include_comments,
|
||||||
|
include_labels,
|
||||||
content_format,
|
content_format,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
keep_markdown_format=keep_markdown_format,
|
keep_markdown_format=keep_markdown_format,
|
||||||
@ -380,13 +391,14 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
limit=limit,
|
limit=limit,
|
||||||
max_pages=max_pages,
|
max_pages=max_pages,
|
||||||
include_archived_spaces=include_archived_content,
|
include_archived_spaces=include_archived_content,
|
||||||
expand=f"{content_format.value},version",
|
expand=expand,
|
||||||
)
|
)
|
||||||
yield from self.process_pages(
|
yield from self.process_pages(
|
||||||
pages,
|
pages,
|
||||||
include_restricted_content,
|
include_restricted_content,
|
||||||
include_attachments,
|
include_attachments,
|
||||||
include_comments,
|
include_comments,
|
||||||
|
False, # labels are not included in the search results
|
||||||
content_format,
|
content_format,
|
||||||
ocr_languages,
|
ocr_languages,
|
||||||
keep_markdown_format,
|
keep_markdown_format,
|
||||||
@ -408,7 +420,8 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||||
)(self.confluence.get_page_by_id)
|
)(self.confluence.get_page_by_id)
|
||||||
page = get_page(
|
page = get_page(
|
||||||
page_id=page_id, expand=f"{content_format.value},version"
|
page_id=page_id,
|
||||||
|
expand=expand,
|
||||||
)
|
)
|
||||||
if not include_restricted_content and not self.is_public_page(page):
|
if not include_restricted_content and not self.is_public_page(page):
|
||||||
continue
|
continue
|
||||||
@ -416,6 +429,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
page,
|
page,
|
||||||
include_attachments,
|
include_attachments,
|
||||||
include_comments,
|
include_comments,
|
||||||
|
include_labels,
|
||||||
content_format,
|
content_format,
|
||||||
ocr_languages,
|
ocr_languages,
|
||||||
keep_markdown_format,
|
keep_markdown_format,
|
||||||
@ -498,6 +512,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_restricted_content: bool,
|
include_restricted_content: bool,
|
||||||
include_attachments: bool,
|
include_attachments: bool,
|
||||||
include_comments: bool,
|
include_comments: bool,
|
||||||
|
include_labels: bool,
|
||||||
content_format: ContentFormat,
|
content_format: ContentFormat,
|
||||||
ocr_languages: Optional[str] = None,
|
ocr_languages: Optional[str] = None,
|
||||||
keep_markdown_format: Optional[bool] = False,
|
keep_markdown_format: Optional[bool] = False,
|
||||||
@ -511,6 +526,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
page,
|
page,
|
||||||
include_attachments,
|
include_attachments,
|
||||||
include_comments,
|
include_comments,
|
||||||
|
include_labels,
|
||||||
content_format,
|
content_format,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
keep_markdown_format=keep_markdown_format,
|
keep_markdown_format=keep_markdown_format,
|
||||||
@ -522,6 +538,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
page: dict,
|
page: dict,
|
||||||
include_attachments: bool,
|
include_attachments: bool,
|
||||||
include_comments: bool,
|
include_comments: bool,
|
||||||
|
include_labels: bool,
|
||||||
content_format: ContentFormat,
|
content_format: ContentFormat,
|
||||||
ocr_languages: Optional[str] = None,
|
ocr_languages: Optional[str] = None,
|
||||||
keep_markdown_format: Optional[bool] = False,
|
keep_markdown_format: Optional[bool] = False,
|
||||||
@ -575,10 +592,19 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
]
|
]
|
||||||
text = text + "".join(comment_texts)
|
text = text + "".join(comment_texts)
|
||||||
|
|
||||||
|
if include_labels:
|
||||||
|
labels = [
|
||||||
|
label["name"]
|
||||||
|
for label in page.get("metadata", {})
|
||||||
|
.get("labels", {})
|
||||||
|
.get("results", [])
|
||||||
|
]
|
||||||
|
|
||||||
metadata = {
|
metadata = {
|
||||||
"title": page["title"],
|
"title": page["title"],
|
||||||
"id": page["id"],
|
"id": page["id"],
|
||||||
"source": self.base_url.strip("/") + page["_links"]["webui"],
|
"source": self.base_url.strip("/") + page["_links"]["webui"],
|
||||||
|
**({"labels": labels} if include_labels else {}),
|
||||||
}
|
}
|
||||||
|
|
||||||
if "version" in page and "when" in page["version"]:
|
if "version" in page and "when" in page["version"]:
|
||||||
|
@ -195,6 +195,36 @@ class TestConfluenceLoader:
|
|||||||
assert mock_confluence.cql.call_count == 0
|
assert mock_confluence.cql.call_count == 0
|
||||||
assert mock_confluence.get_page_child_by_type.call_count == 0
|
assert mock_confluence.get_page_child_by_type.call_count == 0
|
||||||
|
|
||||||
|
@pytest.mark.requires("markdownify")
|
||||||
|
def test_confluence_loader_when_include_lables_set_to_true(
|
||||||
|
self, mock_confluence: MagicMock
|
||||||
|
) -> None:
|
||||||
|
# one response with two pages
|
||||||
|
mock_confluence.get_all_pages_from_space.return_value = [
|
||||||
|
self._get_mock_page("123", include_labels=True),
|
||||||
|
self._get_mock_page("456", include_labels=False),
|
||||||
|
]
|
||||||
|
mock_confluence.get_all_restrictions_for_content.side_effect = [
|
||||||
|
self._get_mock_page_restrictions("123"),
|
||||||
|
self._get_mock_page_restrictions("456"),
|
||||||
|
]
|
||||||
|
|
||||||
|
conflence_loader = self._get_mock_confluence_loader(
|
||||||
|
mock_confluence,
|
||||||
|
space_key=self.MOCK_SPACE_KEY,
|
||||||
|
include_labels=True,
|
||||||
|
max_pages=2,
|
||||||
|
)
|
||||||
|
|
||||||
|
documents = conflence_loader.load()
|
||||||
|
|
||||||
|
assert mock_confluence.get_all_pages_from_space.call_count == 1
|
||||||
|
|
||||||
|
assert len(documents) == 2
|
||||||
|
assert all(isinstance(doc, Document) for doc in documents)
|
||||||
|
assert documents[0].metadata["labels"] == ["l1", "l2"]
|
||||||
|
assert documents[1].metadata["labels"] == []
|
||||||
|
|
||||||
def _get_mock_confluence_loader(
|
def _get_mock_confluence_loader(
|
||||||
self, mock_confluence: MagicMock, **kwargs: Any
|
self, mock_confluence: MagicMock, **kwargs: Any
|
||||||
) -> ConfluenceLoader:
|
) -> ConfluenceLoader:
|
||||||
@ -208,7 +238,10 @@ class TestConfluenceLoader:
|
|||||||
return confluence_loader
|
return confluence_loader
|
||||||
|
|
||||||
def _get_mock_page(
|
def _get_mock_page(
|
||||||
self, page_id: str, content_format: ContentFormat = ContentFormat.STORAGE
|
self,
|
||||||
|
page_id: str,
|
||||||
|
content_format: ContentFormat = ContentFormat.STORAGE,
|
||||||
|
include_labels: bool = False,
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
return {
|
return {
|
||||||
"id": f"{page_id}",
|
"id": f"{page_id}",
|
||||||
@ -216,6 +249,20 @@ class TestConfluenceLoader:
|
|||||||
"body": {
|
"body": {
|
||||||
f"{content_format.name.lower()}": {"value": f"<p>Content {page_id}</p>"}
|
f"{content_format.name.lower()}": {"value": f"<p>Content {page_id}</p>"}
|
||||||
},
|
},
|
||||||
|
**(
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"labels": {
|
||||||
|
"results": [
|
||||||
|
{"prefix": "global", "name": "l1", "id": "111"},
|
||||||
|
{"prefix": "global", "name": "l2", "id": "222"},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if include_labels
|
||||||
|
else {},
|
||||||
|
}
|
||||||
|
),
|
||||||
"status": "current",
|
"status": "current",
|
||||||
"type": "page",
|
"type": "page",
|
||||||
"_links": {
|
"_links": {
|
||||||
|
Loading…
Reference in New Issue
Block a user