Fixed the error on ConfluenceLoader when content_format=VIEW and keep_markdown_format=True (#9633)

- Description: when I set `content_format=ContentFormat.VIEW` and `keep_markdown_format=True` on ConfluenceLoader, it raised the following error:
  ```
  langchain/document_loaders/confluence.py", line 459, in process_page
      page["body"]["storage"]["value"], heading_style="ATX"
  KeyError: 'storage'
  ```
  The reason is that the content format was set to `view`, but the loader still tried to read the content from `page["body"]["storage"]["value"]`. Also added the other content formats supported by the Atlassian API: https://stackoverflow.com/questions/34353955/confluence-rest-api-expanding-page-body-when-retrieving-page-by-title/34363386#34363386
- Issue: not applicable.
- Dependencies: added the optional dependency `markdownify` for anyone who wants to extract pages in Markdown format.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
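A minimal sketch of the loader call that used to raise this `KeyError` and now works; the URL, credentials, and space key below are placeholders, not values from this change:

```python
from langchain.document_loaders import ConfluenceLoader
from langchain.document_loaders.confluence import ContentFormat

# Placeholder connection details; any reachable Confluence instance will do.
loader = ConfluenceLoader(
    url="https://yoursite.atlassian.com/wiki",
    username="me@example.com",
    api_key="my-api-key",
)

# Before this fix, keep_markdown_format=True always read page["body"]["storage"],
# so combining it with ContentFormat.VIEW raised KeyError: 'storage'.
documents = loader.load(
    space_key="SPACE",
    content_format=ContentFormat.VIEW,
    keep_markdown_format=True,
    limit=50,
)
```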
This commit is contained in: parent e1f4f9ac3e, commit b379c5f9c8
libs/langchain/langchain/document_loaders/confluence.py

@@ -20,16 +20,14 @@ logger = logging.getLogger(__name__)
 class ContentFormat(str, Enum):
     """Enumerator of the content formats of Confluence page."""
 
+    EDITOR = "body.editor"
+    EXPORT_VIEW = "body.export_view"
+    ANONYMOUS_EXPORT_VIEW = "body.anonymous_export_view"
     STORAGE = "body.storage"
     VIEW = "body.view"
 
     def get_content(self, page: dict) -> str:
-        if self == ContentFormat.STORAGE:
-            return page["body"]["storage"]["value"]
-        elif self == ContentFormat.VIEW:
-            return page["body"]["view"]["value"]
-
-        raise ValueError("unknown content format")
+        return page["body"][self.name.lower()]["value"]
 
 
 class ConfluenceLoader(BaseLoader):
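With the new enum members, `get_content` no longer needs per-format branches: it lowercases the member name to pick the matching key under `page["body"]`. A quick illustration with a hand-built page dict (a stand-in, not real API output):

```python
from langchain.document_loaders.confluence import ContentFormat

# Stand-in for the JSON returned when the page is expanded with body.export_view.
page = {"body": {"export_view": {"value": "<p>rendered html</p>"}}}

# "EXPORT_VIEW".lower() -> "export_view", so the right body key is selected.
assert ContentFormat.EXPORT_VIEW.get_content(page) == "<p>rendered html</p>"
```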
@@ -52,7 +50,10 @@ class ConfluenceLoader(BaseLoader):
     raw XML representation for storage. The view format is the HTML representation for
     viewing with macros are rendered as though it is viewed by users. You can pass
     a enum `content_format` argument to `load()` to specify the content format, this is
-    set to `ContentFormat.STORAGE` by default.
+    set to `ContentFormat.STORAGE` by default, the supported values are:
+    `ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`,
+    `ContentFormat.ANONYMOUS_EXPORT_VIEW`, `ContentFormat.STORAGE`,
+    and `ContentFormat.VIEW`.
 
     Hint: space_key and page_id can both be found in the URL of a page in Confluence
     - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
@@ -238,7 +239,11 @@ class ConfluenceLoader(BaseLoader):
         :type include_attachments: bool, optional
         :param include_comments: defaults to False
         :type include_comments: bool, optional
-        :param content_format: Specify content format, defaults to ContentFormat.STORAGE
+        :param content_format: Specify content format, defaults to
+            ContentFormat.STORAGE, the supported values are:
+            `ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`,
+            `ContentFormat.ANONYMOUS_EXPORT_VIEW`,
+            `ContentFormat.STORAGE`, and `ContentFormat.VIEW`.
         :type content_format: ContentFormat
         :param limit: Maximum number of pages to retrieve per request, defaults to 50
         :type limit: int, optional
@@ -473,14 +478,12 @@ class ConfluenceLoader(BaseLoader):
         else:
             attachment_texts = []
 
+        content = content_format.get_content(page)
         if keep_markdown_format:
             # Use markdownify to keep the page Markdown style
-            text = markdownify(
-                page["body"]["storage"]["value"], heading_style="ATX"
-            ) + "".join(attachment_texts)
+            text = markdownify(content, heading_style="ATX") + "".join(attachment_texts)
 
         else:
-            content = content_format.get_content(page)
             if keep_newlines:
                 text = BeautifulSoup(
                     content.replace("</p>", "\n</p>").replace("<br />", "\n"), "lxml"
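The hunk above is the actual fix: `content` is now resolved from the requested format before the `keep_markdown_format` branch, instead of hard-coding `page["body"]["storage"]` inside it. A toy reproduction of why the old lookup failed (the dict is a stand-in for the API response when only `body.view` was expanded):

```python
# Stand-in for the API payload when the page was fetched with body.view only.
page = {"body": {"view": {"value": "<p>Content</p>"}}}

print(page["body"]["view"]["value"])  # what the fixed code reads

try:
    page["body"]["storage"]["value"]  # the old hard-coded lookup
except KeyError as err:
    print(f"KeyError: {err}")  # KeyError: 'storage', exactly as reported
```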
libs/langchain/poetry.lock (generated, 19 changed lines)
@@ -4288,6 +4288,21 @@ profiling = ["gprof2dot"]
 rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
 testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
 
+[[package]]
+name = "markdownify"
+version = "0.11.6"
+description = "Convert HTML to markdown."
+optional = true
+python-versions = "*"
+files = [
+    {file = "markdownify-0.11.6-py3-none-any.whl", hash = "sha256:ba35fe289d5e9073bcd7d2cad629278fe25f1a93741fcdc0bfb4f009076d8324"},
+    {file = "markdownify-0.11.6.tar.gz", hash = "sha256:009b240e0c9f4c8eaf1d085625dcd4011e12f0f8cec55dedf9ea6f7655e49bfe"},
+]
+
+[package.dependencies]
+beautifulsoup4 = ">=4.9,<5"
+six = ">=1.15,<2"
+
 [[package]]
 name = "markupsafe"
 version = "2.1.3"
@@ -10323,7 +10338,7 @@ clarifai = ["clarifai"]
 cohere = ["cohere"]
 docarray = ["docarray"]
 embeddings = ["sentence-transformers"]
-extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"]
+extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"]
 javascript = ["esprima"]
 llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
 openai = ["openai", "tiktoken"]
@@ -10333,4 +10348,4 @@ text-helpers = ["chardet"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "27c44e64d872c51f42b58f9f5185f20914dc4360e91860cfc260b1acbdaa3272"
+content-hash = "83280a03c352011c2a51081a29aca67bb5c4c23054ad1b7be94f89d6ce52460b"
libs/langchain/pyproject.toml

@@ -126,6 +126,7 @@ amazon-textract-caller = {version = "<2", optional = true}
 xata = {version = "^1.0.0a7", optional = true}
 xmltodict = {version = "^0.13.0", optional = true}
 google-api-core = {version = "^2.11.1", optional = true}
+markdownify = {version = "^0.11.6", optional = true}
 
 
 [tool.poetry.group.test.dependencies]
@@ -338,6 +339,7 @@ extended_testing = [
     "xmltodict",
     "faiss-cpu",
     "openapi-schema-pydantic",
+    "markdownify",
 ]
 
 [tool.ruff]
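Because `markdownify` is only an optional extra, the loader cannot import it unconditionally at module level. A rough sketch of the lazy-import guard LangChain typically uses for optional dependencies (the helper name below is made up for illustration, not code from this commit):

```python
def _import_markdownify():
    """Hypothetical helper: import markdownify only when it is actually needed."""
    try:
        from markdownify import markdownify
    except ImportError:
        raise ImportError(
            "`markdownify` package not found, please run `pip install markdownify`"
        )
    return markdownify
```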
libs/langchain/tests/unit_tests/document_loaders/test_confluence.py

@@ -6,7 +6,7 @@ import pytest
 import requests
 
 from langchain.docstore.document import Document
-from langchain.document_loaders.confluence import ConfluenceLoader
+from langchain.document_loaders.confluence import ConfluenceLoader, ContentFormat
 
 
 @pytest.fixture
@@ -152,6 +152,40 @@ class TestConfluenceLoader:
         assert mock_confluence.cql.call_count == 0
         assert mock_confluence.get_page_child_by_type.call_count == 0
 
+    def test_confluence_loader_when_content_format_and_keep_markdown_format_enabled(
+        self, mock_confluence: MagicMock
+    ) -> None:
+        # one response with two pages
+        mock_confluence.get_all_pages_from_space.return_value = [
+            self._get_mock_page("123", ContentFormat.VIEW),
+            self._get_mock_page("456", ContentFormat.VIEW),
+        ]
+        mock_confluence.get_all_restrictions_for_content.side_effect = [
+            self._get_mock_page_restrictions("123"),
+            self._get_mock_page_restrictions("456"),
+        ]
+
+        confluence_loader = self._get_mock_confluence_loader(mock_confluence)
+
+        documents = confluence_loader.load(
+            space_key=self.MOCK_SPACE_KEY,
+            content_format=ContentFormat.VIEW,
+            keep_markdown_format=True,
+            max_pages=2,
+        )
+
+        assert mock_confluence.get_all_pages_from_space.call_count == 1
+
+        assert len(documents) == 2
+        assert all(isinstance(doc, Document) for doc in documents)
+        assert documents[0].page_content == "Content 123\n\n"
+        assert documents[1].page_content == "Content 456\n\n"
+
+        assert mock_confluence.get_page_by_id.call_count == 0
+        assert mock_confluence.get_all_pages_by_label.call_count == 0
+        assert mock_confluence.cql.call_count == 0
+        assert mock_confluence.get_page_child_by_type.call_count == 0
+
     def _get_mock_confluence_loader(
         self, mock_confluence: MagicMock
     ) -> ConfluenceLoader:
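The expected `page_content` values in the new test come directly from what `markdownify` produces for the mock page body; a small sanity check, assuming `markdownify` 0.11.x is installed:

```python
from markdownify import markdownify

# The mock body is "<p>Content 123</p>"; markdownify drops the <p> tags and
# ends the paragraph with a blank line, hence the "\n\n" suffix asserted above.
assert markdownify("<p>Content 123</p>", heading_style="ATX") == "Content 123\n\n"
```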
@@ -163,11 +197,15 @@ class TestConfluenceLoader:
         confluence_loader.confluence = mock_confluence
         return confluence_loader
 
-    def _get_mock_page(self, page_id: str) -> Dict:
+    def _get_mock_page(
+        self, page_id: str, content_format: ContentFormat = ContentFormat.STORAGE
+    ) -> Dict:
         return {
             "id": f"{page_id}",
             "title": f"Page {page_id}",
-            "body": {"storage": {"value": f"<p>Content {page_id}</p>"}},
+            "body": {
+                f"{content_format.name.lower()}": {"value": f"<p>Content {page_id}</p>"}
+            },
             "status": "current",
             "type": "page",
             "_links": {