Mirror of https://github.com/hwchase17/langchain.git, synced 2025-07-06 13:18:12 +00:00
Several confluence loader improvements (#3300)
This PR addresses several improvements:

- Previously it was not possible to load spaces of more than 100 pages. The `limit` parameter was used both as an overall page limit *and* as a per-request pagination limit. Combined with the fact that Atlassian appears to apply a server-side hard limit of 100 when page content is expanded, this meant it wasn't possible to download more than 100 pages. Now `limit` is used *only* as a per-request pagination limit, and a new `max_pages` parameter caps the total number of pages returned by the paginator.
- Document metadata now includes `source` (the source URL), making the loader compatible with `RetrievalQAWithSourcesChain`.
- It is now possible to include inline and footer comments.
- It is now possible to pass `verify_ssl=False` and other parameters through to the underlying `Confluence` client for use cases that require it.
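For reference, a minimal usage sketch of the loader after this change (the URL and space key are placeholders, not values from this PR):

```python
from langchain.document_loaders import ConfluenceLoader

# Placeholder URL and space key; substitute your own instance.
loader = ConfluenceLoader(url="https://example.atlassian.net/wiki/")

# `limit` is now only the per-request page size; `max_pages` caps the total,
# so spaces larger than 100 pages can be loaded in full.
docs = loader.load(
    space_key="SPACE",
    limit=50,
    max_pages=500,
    include_comments=True,  # pull inline and footer comments into the text
)

# Each document now carries a `source` URL, which is what
# RetrievalQAWithSourcesChain needs.
print(docs[0].metadata["source"])
```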
Parent: 651cb62556
Commit: b4de839ed8
@@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader):
     :type min_retry_seconds: Optional[int], optional
     :param max_retry_seconds: defaults to 10
     :type max_retry_seconds: Optional[int], optional
+    :param confluence_kwargs: additional kwargs to initialize confluence with
+    :type confluence_kwargs: dict, optional
     :raises ValueError: Errors while validating input
     :raises ImportError: Required dependencies not installed.
     """
@@ -74,7 +76,9 @@ class ConfluenceLoader(BaseLoader):
         number_of_retries: Optional[int] = 3,
         min_retry_seconds: Optional[int] = 2,
         max_retry_seconds: Optional[int] = 10,
+        confluence_kwargs: Optional[dict] = None,
     ):
+        confluence_kwargs = confluence_kwargs or {}
         errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
         if errors:
             raise ValueError(f"Error(s) while validating input: {errors}")
@@ -93,10 +97,16 @@ class ConfluenceLoader(BaseLoader):
         )

         if oauth2:
-            self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud)
+            self.confluence = Confluence(
+                url=url, oauth2=oauth2, cloud=cloud, **confluence_kwargs
+            )
         else:
             self.confluence = Confluence(
-                url=url, username=username, password=api_key, cloud=cloud
+                url=url,
+                username=username,
+                password=api_key,
+                cloud=cloud,
+                **confluence_kwargs,
             )

     @staticmethod
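Since `confluence_kwargs` is splatted straight into the `atlassian.Confluence` constructor, anything that client accepts can be forwarded. A minimal sketch, with a hypothetical self-hosted URL and credentials:

```python
from langchain.document_loaders import ConfluenceLoader

# Hypothetical self-hosted instance whose certificate the client can't verify.
loader = ConfluenceLoader(
    url="https://wiki.internal.example.com/",
    username="me@example.com",
    api_key="my-api-token",
    confluence_kwargs={"verify_ssl": False},  # forwarded via **confluence_kwargs
)
```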
@@ -147,7 +157,9 @@ class ConfluenceLoader(BaseLoader):
         label: Optional[str] = None,
         cql: Optional[str] = None,
         include_attachments: bool = False,
+        include_comments: bool = False,
         limit: Optional[int] = 50,
+        max_pages: Optional[int] = 1000,
     ) -> List[Document]:
         """
         :param space_key: Space key retrieved from a confluence URL, defaults to None
@@ -160,8 +172,12 @@ class ConfluenceLoader(BaseLoader):
         :type cql: Optional[str], optional
         :param include_attachments: defaults to False
         :type include_attachments: bool, optional
-        :param limit: Maximum number of pages to retrieve, defaults to 50
+        :param include_comments: defaults to False
+        :type include_comments: bool, optional
+        :param limit: Maximum number of pages to retrieve per request, defaults to 50
         :type limit: int, optional
+        :param max_pages: Maximum number of pages to retrieve in total, defaults to 1000
+        :type max_pages: int, optional
         :raises ValueError: _description_
         :raises ImportError: _description_
         :return: _description_
@@ -191,10 +207,13 @@ class ConfluenceLoader(BaseLoader):
                 self.confluence.get_all_pages_from_space,
                 space=space_key,
                 limit=limit,
+                max_pages=max_pages,
                 expand="body.storage.value",
             )
             for page in pages:
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         if label:
@@ -202,18 +221,27 @@ class ConfluenceLoader(BaseLoader):
                 self.confluence.get_all_pages_by_label,
                 label=label,
                 limit=limit,
+                max_pages=max_pages,
                 expand="body.storage.value",
             )
             for page in pages:
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         if cql:
             pages = self.paginate_request(
-                self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value"
+                self.confluence.cql,
+                cql=cql,
+                limit=limit,
+                max_pages=max_pages,
+                expand="body.storage.value",
             )
             for page in pages:
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         if page_ids:
@@ -231,7 +259,9 @@ class ConfluenceLoader(BaseLoader):
                     before_sleep=before_sleep_log(logger, logging.WARNING),
                 )(self.confluence.get_page_by_id)
                 page = get_page(page_id=page_id, expand="body.storage.value")
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         return docs
@@ -239,11 +269,13 @@ class ConfluenceLoader(BaseLoader):
     def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
         """Paginate the various methods to retrieve groups of pages.

-        Unforunately, due to page size, sometimes the Confluence API
-        doesn't match the limit value. Also, due to the Atlassian Python
+        Unfortunately, due to page size, sometimes the Confluence API
+        doesn't match the limit value. If `limit` is >100 confluence
+        seems to cap the response to 100. Also, due to the Atlassian Python
         package, we don't get the "next" values from the "_links" key because
         they only return the value from the results key. So here, the pagination
-        starts from 0 and goes until the limit. We have to manually check if there
+        starts from 0 and goes until the max_pages, getting the `limit` number
+        of pages with each request. We have to manually check if there
         are more docs based on the length of the returned list of pages, rather than
         just checking for the presence of a `next` key in the response like this page
         would have you do:
@@ -255,10 +287,9 @@ class ConfluenceLoader(BaseLoader):
         :rtype: List
         """

-        limit = kwargs["limit"]
-        page = 0
-        docs = []
-        while page < limit:
+        max_pages = kwargs.pop("max_pages")
+        docs: List[dict] = []
+        while len(docs) < max_pages:
             get_pages = retry(
                 reraise=True,
                 stop=stop_after_attempt(
@@ -271,16 +302,18 @@ class ConfluenceLoader(BaseLoader):
                 ),
                 before_sleep=before_sleep_log(logger, logging.WARNING),
             )(retrieval_method)
-            batch = get_pages(**kwargs, start=page)
-            if len(batch) < limit:
-                page = limit
-            else:
-                page += len(batch)
+            batch = get_pages(**kwargs, start=len(docs))
+            if not batch:
+                break
             docs.extend(batch)
-        return docs
+        return docs[:max_pages]

     def process_page(
-        self, page: dict, include_attachments: bool, text_maker: Any
+        self,
+        page: dict,
+        include_attachments: bool,
+        include_comments: bool,
+        text_maker: Any,
     ) -> Document:
         if include_attachments:
             attachment_texts = self.process_attachment(page["id"])
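Distilled from the hunk above, a self-contained sketch of the new pagination strategy; the stub retrieval method and seven-page corpus are illustrative stand-ins, not part of the loader:

```python
from typing import Any, Callable, List

# Stand-in for e.g. confluence.get_all_pages_from_space: serves a fixed
# corpus in `limit`-sized slices, the way the REST API pages results.
CORPUS = [{"id": str(i)} for i in range(7)]

def fake_retrieval(start: int = 0, limit: int = 0, **_: Any) -> List[dict]:
    return CORPUS[start : start + limit]

def paginate(retrieval_method: Callable, **kwargs: Any) -> List[dict]:
    max_pages = kwargs.pop("max_pages")
    docs: List[dict] = []
    while len(docs) < max_pages:
        # Resume from how many docs we already have, not from a page counter.
        batch = retrieval_method(**kwargs, start=len(docs))
        if not batch:  # the server ran out before max_pages was reached
            break
        docs.extend(batch)
    return docs[:max_pages]  # trim any overshoot from the final batch

print(len(paginate(fake_retrieval, limit=3, max_pages=5)))   # 5 (two requests)
print(len(paginate(fake_retrieval, limit=3, max_pages=50)))  # 7 (corpus exhausted)
```

Because each request asks for only `limit` items at a time, the apparent per-request server cap of 100 no longer bounds the total number of pages retrieved.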
@@ -289,8 +322,23 @@ class ConfluenceLoader(BaseLoader):
         text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
             attachment_texts
         )
+        if include_comments:
+            comments = self.confluence.get_page_comments(
+                page["id"], expand="body.view.value", depth="all"
+            )["results"]
+            comment_texts = [
+                text_maker.handle(comment["body"]["view"]["value"])
+                for comment in comments
+            ]
+            text = text + "".join(comment_texts)
+
         return Document(
-            page_content=text, metadata={"title": page["title"], "id": page["id"]}
+            page_content=text,
+            metadata={
+                "title": page["title"],
+                "id": page["id"],
+                "source": self.base_url.strip("/") + page["_links"]["webui"],
+            },
         )

     def process_attachment(self, page_id: str) -> List[str]:
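For reference, a small sketch of how the new `source` metadata value is assembled, assuming the relative `webui` link the API would return for the page asserted in the integration test below:

```python
# Hypothetical page payload; the `webui` value is inferred from the test's
# expected URL, not captured from a live API response.
base_url = "https://templates.atlassian.net/wiki/"
page = {"_links": {"webui": "/spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"}}

# strip("/") drops the trailing slash so the two halves join cleanly.
source = base_url.strip("/") + page["_links"]["webui"]
assert source == (
    "https://templates.atlassian.net/wiki"
    "/spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
)
```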
@@ -19,6 +19,10 @@ def test_load_single_confluence_page() -> None:
     assert docs[0].page_content is not None
     assert docs[0].metadata["id"] == "33189"
     assert docs[0].metadata["title"] == "An easy intro to using Confluence"
+    assert docs[0].metadata["source"] == (
+        "https://templates.atlassian.net/wiki/"
+        "spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
+    )


 @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
@@ -33,7 +37,18 @@ def test_load_full_confluence_space() -> None:
 @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
 def test_confluence_pagination() -> None:
     loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
-    docs = loader.load(space_key="RD", limit=5)
+    # this will issue 2 requests; each with a limit of 3 until the max_pages of 5 is met
+    docs = loader.load(space_key="RD", limit=3, max_pages=5)
+
     assert len(docs) == 5
     assert docs[0].page_content is not None


+@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
+def test_pass_confluence_kwargs() -> None:
+    loader = ConfluenceLoader(
+        url="https://templates.atlassian.net/wiki/",
+        confluence_kwargs={"verify_ssl": False},
+    )
+
+    assert loader.confluence.verify_ssl is False