Fixed the error on ConfluenceLoader when content_format=VIEW and keep_markdown_format=True (#9633)

- Description: a description of the change

When `content_format=ContentFormat.VIEW` and `keep_markdown_format=True`
are both set on ConfluenceLoader, loading a page fails with the following
error:
```
langchain/document_loaders/confluence.py", line 459, in process_page
    page["body"]["storage"]["value"], heading_style="ATX"
KeyError: 'storage'
```
The cause is that the content format was set to `view`, but the loader was
still trying to read the content from `page["body"]["storage"]["value"]`.

Also added the other content formats that are supported by the Atlassian
API (see the link below).

https://stackoverflow.com/questions/34353955/confluence-rest-api-expanding-page-body-when-retrieving-page-by-title/34363386#34363386

  - Issue: the issue # it fixes (if applicable),

Not applicable.

  - Dependencies: any dependencies required for this change,

Added `markdownify` as an optional dependency; it is only needed when
extracting pages in Markdown format (`keep_markdown_format=True`).

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Jun Liu 2023-08-23 14:00:15 +10:00 committed by GitHub
parent e1f4f9ac3e
commit b379c5f9c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 75 additions and 17 deletions

View File

@ -20,16 +20,14 @@ logger = logging.getLogger(__name__)
class ContentFormat(str, Enum): class ContentFormat(str, Enum):
"""Enumerator of the content formats of Confluence page.""" """Enumerator of the content formats of Confluence page."""
EDITOR = "body.editor"
EXPORT_VIEW = "body.export_view"
ANONYMOUS_EXPORT_VIEW = "body.anonymous_export_view"
STORAGE = "body.storage" STORAGE = "body.storage"
VIEW = "body.view" VIEW = "body.view"
def get_content(self, page: dict) -> str: def get_content(self, page: dict) -> str:
if self == ContentFormat.STORAGE: return page["body"][self.name.lower()]["value"]
return page["body"]["storage"]["value"]
elif self == ContentFormat.VIEW:
return page["body"]["view"]["value"]
raise ValueError("unknown content format")
class ConfluenceLoader(BaseLoader): class ConfluenceLoader(BaseLoader):
@ -52,7 +50,10 @@ class ConfluenceLoader(BaseLoader):
raw XML representation for storage. The view format is the HTML representation for raw XML representation for storage. The view format is the HTML representation for
viewing with macros are rendered as though it is viewed by users. You can pass viewing with macros are rendered as though it is viewed by users. You can pass
a enum `content_format` argument to `load()` to specify the content format, this is a enum `content_format` argument to `load()` to specify the content format, this is
set to `ContentFormat.STORAGE` by default. set to `ContentFormat.STORAGE` by default, the supported values are:
`ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`,
`ContentFormat.ANONYMOUS_EXPORT_VIEW`, `ContentFormat.STORAGE`,
and `ContentFormat.VIEW`.
Hint: space_key and page_id can both be found in the URL of a page in Confluence Hint: space_key and page_id can both be found in the URL of a page in Confluence
- https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id> - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
@ -238,7 +239,11 @@ class ConfluenceLoader(BaseLoader):
:type include_attachments: bool, optional :type include_attachments: bool, optional
:param include_comments: defaults to False :param include_comments: defaults to False
:type include_comments: bool, optional :type include_comments: bool, optional
:param content_format: Specify content format, defaults to ContentFormat.STORAGE :param content_format: Specify content format, defaults to
ContentFormat.STORAGE, the supported values are:
`ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`,
`ContentFormat.ANONYMOUS_EXPORT_VIEW`,
`ContentFormat.STORAGE`, and `ContentFormat.VIEW`.
:type content_format: ContentFormat :type content_format: ContentFormat
:param limit: Maximum number of pages to retrieve per request, defaults to 50 :param limit: Maximum number of pages to retrieve per request, defaults to 50
:type limit: int, optional :type limit: int, optional
@ -473,14 +478,12 @@ class ConfluenceLoader(BaseLoader):
else: else:
attachment_texts = [] attachment_texts = []
content = content_format.get_content(page)
if keep_markdown_format: if keep_markdown_format:
# Use markdownify to keep the page Markdown style # Use markdownify to keep the page Markdown style
text = markdownify( text = markdownify(content, heading_style="ATX") + "".join(attachment_texts)
page["body"]["storage"]["value"], heading_style="ATX"
) + "".join(attachment_texts)
else: else:
content = content_format.get_content(page)
if keep_newlines: if keep_newlines:
text = BeautifulSoup( text = BeautifulSoup(
content.replace("</p>", "\n</p>").replace("<br />", "\n"), "lxml" content.replace("</p>", "\n</p>").replace("<br />", "\n"), "lxml"

View File

@ -4288,6 +4288,21 @@ profiling = ["gprof2dot"]
rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
[[package]]
name = "markdownify"
version = "0.11.6"
description = "Convert HTML to markdown."
optional = true
python-versions = "*"
files = [
{file = "markdownify-0.11.6-py3-none-any.whl", hash = "sha256:ba35fe289d5e9073bcd7d2cad629278fe25f1a93741fcdc0bfb4f009076d8324"},
{file = "markdownify-0.11.6.tar.gz", hash = "sha256:009b240e0c9f4c8eaf1d085625dcd4011e12f0f8cec55dedf9ea6f7655e49bfe"},
]
[package.dependencies]
beautifulsoup4 = ">=4.9,<5"
six = ">=1.15,<2"
[[package]] [[package]]
name = "markupsafe" name = "markupsafe"
version = "2.1.3" version = "2.1.3"
@ -10323,7 +10338,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"] cohere = ["cohere"]
docarray = ["docarray"] docarray = ["docarray"]
embeddings = ["sentence-transformers"] embeddings = ["sentence-transformers"]
extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"] extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"]
javascript = ["esprima"] javascript = ["esprima"]
llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"] openai = ["openai", "tiktoken"]
@ -10333,4 +10348,4 @@ text-helpers = ["chardet"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<4.0" python-versions = ">=3.8.1,<4.0"
content-hash = "27c44e64d872c51f42b58f9f5185f20914dc4360e91860cfc260b1acbdaa3272" content-hash = "83280a03c352011c2a51081a29aca67bb5c4c23054ad1b7be94f89d6ce52460b"

View File

@ -126,6 +126,7 @@ amazon-textract-caller = {version = "<2", optional = true}
xata = {version = "^1.0.0a7", optional = true} xata = {version = "^1.0.0a7", optional = true}
xmltodict = {version = "^0.13.0", optional = true} xmltodict = {version = "^0.13.0", optional = true}
google-api-core = {version = "^2.11.1", optional = true} google-api-core = {version = "^2.11.1", optional = true}
markdownify = {version = "^0.11.6", optional = true}
[tool.poetry.group.test.dependencies] [tool.poetry.group.test.dependencies]
@ -338,6 +339,7 @@ extended_testing = [
"xmltodict", "xmltodict",
"faiss-cpu", "faiss-cpu",
"openapi-schema-pydantic", "openapi-schema-pydantic",
"markdownify",
] ]
[tool.ruff] [tool.ruff]

View File

@ -6,7 +6,7 @@ import pytest
import requests import requests
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.confluence import ConfluenceLoader from langchain.document_loaders.confluence import ConfluenceLoader, ContentFormat
@pytest.fixture @pytest.fixture
@ -152,6 +152,40 @@ class TestConfluenceLoader:
assert mock_confluence.cql.call_count == 0 assert mock_confluence.cql.call_count == 0
assert mock_confluence.get_page_child_by_type.call_count == 0 assert mock_confluence.get_page_child_by_type.call_count == 0
def test_confluence_loader_when_content_format_and_keep_markdown_format_enabled(
self, mock_confluence: MagicMock
) -> None:
# one response with two pages
mock_confluence.get_all_pages_from_space.return_value = [
self._get_mock_page("123", ContentFormat.VIEW),
self._get_mock_page("456", ContentFormat.VIEW),
]
mock_confluence.get_all_restrictions_for_content.side_effect = [
self._get_mock_page_restrictions("123"),
self._get_mock_page_restrictions("456"),
]
confluence_loader = self._get_mock_confluence_loader(mock_confluence)
documents = confluence_loader.load(
space_key=self.MOCK_SPACE_KEY,
content_format=ContentFormat.VIEW,
keep_markdown_format=True,
max_pages=2,
)
assert mock_confluence.get_all_pages_from_space.call_count == 1
assert len(documents) == 2
assert all(isinstance(doc, Document) for doc in documents)
assert documents[0].page_content == "Content 123\n\n"
assert documents[1].page_content == "Content 456\n\n"
assert mock_confluence.get_page_by_id.call_count == 0
assert mock_confluence.get_all_pages_by_label.call_count == 0
assert mock_confluence.cql.call_count == 0
assert mock_confluence.get_page_child_by_type.call_count == 0
def _get_mock_confluence_loader( def _get_mock_confluence_loader(
self, mock_confluence: MagicMock self, mock_confluence: MagicMock
) -> ConfluenceLoader: ) -> ConfluenceLoader:
@ -163,11 +197,15 @@ class TestConfluenceLoader:
confluence_loader.confluence = mock_confluence confluence_loader.confluence = mock_confluence
return confluence_loader return confluence_loader
def _get_mock_page(self, page_id: str) -> Dict: def _get_mock_page(
self, page_id: str, content_format: ContentFormat = ContentFormat.STORAGE
) -> Dict:
return { return {
"id": f"{page_id}", "id": f"{page_id}",
"title": f"Page {page_id}", "title": f"Page {page_id}",
"body": {"storage": {"value": f"<p>Content {page_id}</p>"}}, "body": {
f"{content_format.name.lower()}": {"value": f"<p>Content {page_id}</p>"}
},
"status": "current", "status": "current",
"type": "page", "type": "page",
"_links": { "_links": {