Several confluence loader improvements (#3300)

This PR addresses several improvements:

- Previously it was not possible to load spaces of more than 100 pages.
The `limit` was being used both as an overall page limit *and* as a
per-request pagination limit. This, in combination with the fact that
Atlassian seems to use a server-side hard limit of 100 when page content
is expanded, meant it wasn't possible to download >100 pages. Now
`limit` is used *only* as a per-request pagination limit and `max_pages`
is introduced as the way to limit the total number of pages returned by
the paginator.
- Document metadata now includes `source` (the source url), making it
compatible with `RetrievalQAWithSourcesChain`.
- It is now possible to include inline and footer comments.
- It is now possible to pass `verify_ssl=False` and other parameters
(via `confluence_kwargs`) to the underlying Confluence client object for
use cases that require it.
This commit is contained in:
Luke Harris
2023-04-23 23:06:10 +01:00
committed by GitHub
parent 651cb62556
commit b4de839ed8
2 changed files with 87 additions and 24 deletions

View File

@@ -19,6 +19,10 @@ def test_load_single_confluence_page() -> None:
assert docs[0].page_content is not None
assert docs[0].metadata["id"] == "33189"
assert docs[0].metadata["title"] == "An easy intro to using Confluence"
assert docs[0].metadata["source"] == (
"https://templates.atlassian.net/wiki/"
"spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
)
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
@@ -33,7 +37,18 @@ def test_load_full_confluence_space() -> None:
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_confluence_pagination() -> None:
    """Check that `max_pages` caps the total while `limit` sizes each request.

    `limit` is only the per-request pagination size; `max_pages` bounds how
    many pages the paginator returns overall, so exactly 5 documents are
    expected back from the "RD" space.
    """
    loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
    # this will issue 2 requests; each with a limit of 3 until the max_pages of 5 is met
    docs = loader.load(space_key="RD", limit=3, max_pages=5)

    assert len(docs) == 5
    assert docs[0].page_content is not None
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_pass_confluence_kwargs() -> None:
    """Check that `confluence_kwargs` entries show up on `loader.confluence`."""
    client_options = {"verify_ssl": False}
    loader = ConfluenceLoader(
        url="https://templates.atlassian.net/wiki/",
        confluence_kwargs=client_options,
    )
    # The option supplied above should be visible on the object the loader holds.
    assert loader.confluence.verify_ssl is False