diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py index a1151c08e2c..739f52f48ed 100644 --- a/libs/langchain/langchain/document_loaders/confluence.py +++ b/libs/langchain/langchain/document_loaders/confluence.py @@ -20,16 +20,14 @@ logger = logging.getLogger(__name__) class ContentFormat(str, Enum): """Enumerator of the content formats of Confluence page.""" + EDITOR = "body.editor" + EXPORT_VIEW = "body.export_view" + ANONYMOUS_EXPORT_VIEW = "body.anonymous_export_view" STORAGE = "body.storage" VIEW = "body.view" def get_content(self, page: dict) -> str: - if self == ContentFormat.STORAGE: - return page["body"]["storage"]["value"] - elif self == ContentFormat.VIEW: - return page["body"]["view"]["value"] - - raise ValueError("unknown content format") + return page["body"][self.name.lower()]["value"] class ConfluenceLoader(BaseLoader): @@ -52,7 +50,10 @@ class ConfluenceLoader(BaseLoader): raw XML representation for storage. The view format is the HTML representation for viewing with macros are rendered as though it is viewed by users. You can pass a enum `content_format` argument to `load()` to specify the content format, this is - set to `ContentFormat.STORAGE` by default. + set to `ContentFormat.STORAGE` by default, the supported values are: + `ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`, + `ContentFormat.ANONYMOUS_EXPORT_VIEW`, `ContentFormat.STORAGE`, + and `ContentFormat.VIEW`. 
Hint: space_key and page_id can both be found in the URL of a page in Confluence - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id> @@ -238,7 +239,11 @@ class ConfluenceLoader(BaseLoader): :type include_attachments: bool, optional :param include_comments: defaults to False :type include_comments: bool, optional - :param content_format: Specify content format, defaults to ContentFormat.STORAGE + :param content_format: Specify content format, defaults to + ContentFormat.STORAGE, the supported values are: + `ContentFormat.EDITOR`, `ContentFormat.EXPORT_VIEW`, + `ContentFormat.ANONYMOUS_EXPORT_VIEW`, + `ContentFormat.STORAGE`, and `ContentFormat.VIEW`. :type content_format: ContentFormat :param limit: Maximum number of pages to retrieve per request, defaults to 50 :type limit: int, optional @@ -473,14 +478,12 @@ class ConfluenceLoader(BaseLoader): else: attachment_texts = [] + content = content_format.get_content(page) if keep_markdown_format: # Use markdownify to keep the page Markdown style - text = markdownify( - page["body"]["storage"]["value"], heading_style="ATX" - ) + "".join(attachment_texts) + text = markdownify(content, heading_style="ATX") + "".join(attachment_texts) else: - content = content_format.get_content(page) if keep_newlines: text = BeautifulSoup( content.replace("</p>", "\n</p>").replace("<br />", "\n"), "lxml" diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock index f065fd0c38d..3ea9c47341f 100644 --- a/libs/langchain/poetry.lock +++ b/libs/langchain/poetry.lock @@ -4288,6 +4288,21 @@ profiling = ["gprof2dot"] rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] +[[package]] +name = "markdownify" +version = "0.11.6" +description = "Convert HTML to markdown." +optional = true +python-versions = "*" +files = [ + {file = "markdownify-0.11.6-py3-none-any.whl", hash = "sha256:ba35fe289d5e9073bcd7d2cad629278fe25f1a93741fcdc0bfb4f009076d8324"}, + {file = "markdownify-0.11.6.tar.gz", hash = "sha256:009b240e0c9f4c8eaf1d085625dcd4011e12f0f8cec55dedf9ea6f7655e49bfe"}, +] + +[package.dependencies] +beautifulsoup4 = ">=4.9,<5" +six = ">=1.15,<2" + [[package]] name = "markupsafe" version = "2.1.3" @@ -10323,7 +10338,7 @@ clarifai = ["clarifai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"] +extended-testing = ["amazon-textract-caller", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", 
"openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"] javascript = ["esprima"] llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] @@ -10333,4 +10348,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "27c44e64d872c51f42b58f9f5185f20914dc4360e91860cfc260b1acbdaa3272" +content-hash = "83280a03c352011c2a51081a29aca67bb5c4c23054ad1b7be94f89d6ce52460b" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index d0271234903..61f05175ddb 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -126,6 +126,7 @@ amazon-textract-caller = {version = "<2", optional = true} xata = {version = "^1.0.0a7", optional = true} xmltodict = {version = "^0.13.0", optional = true} google-api-core = {version = "^2.11.1", optional = true} +markdownify = {version = "^0.11.6", optional = true} [tool.poetry.group.test.dependencies] @@ -338,6 +339,7 @@ extended_testing = [ "xmltodict", "faiss-cpu", "openapi-schema-pydantic", + "markdownify", ] [tool.ruff] diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py b/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py index 42de78598a6..0048a8fba41 100644 --- a/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py +++ b/libs/langchain/tests/unit_tests/document_loaders/test_confluence.py @@ -6,7 +6,7 @@ import pytest import requests from langchain.docstore.document import Document -from langchain.document_loaders.confluence import ConfluenceLoader +from langchain.document_loaders.confluence import ConfluenceLoader, ContentFormat @pytest.fixture @@ -152,6 +152,40 @@ class 
TestConfluenceLoader: assert mock_confluence.cql.call_count == 0 assert mock_confluence.get_page_child_by_type.call_count == 0 + def test_confluence_loader_when_content_format_and_keep_markdown_format_enabled( + self, mock_confluence: MagicMock + ) -> None: + # one response with two pages + mock_confluence.get_all_pages_from_space.return_value = [ + self._get_mock_page("123", ContentFormat.VIEW), + self._get_mock_page("456", ContentFormat.VIEW), + ] + mock_confluence.get_all_restrictions_for_content.side_effect = [ + self._get_mock_page_restrictions("123"), + self._get_mock_page_restrictions("456"), + ] + + confluence_loader = self._get_mock_confluence_loader(mock_confluence) + + documents = confluence_loader.load( + space_key=self.MOCK_SPACE_KEY, + content_format=ContentFormat.VIEW, + keep_markdown_format=True, + max_pages=2, + ) + + assert mock_confluence.get_all_pages_from_space.call_count == 1 + + assert len(documents) == 2 + assert all(isinstance(doc, Document) for doc in documents) + assert documents[0].page_content == "Content 123\n\n" + assert documents[1].page_content == "Content 456\n\n" + + assert mock_confluence.get_page_by_id.call_count == 0 + assert mock_confluence.get_all_pages_by_label.call_count == 0 + assert mock_confluence.cql.call_count == 0 + assert mock_confluence.get_page_child_by_type.call_count == 0 + def _get_mock_confluence_loader( self, mock_confluence: MagicMock ) -> ConfluenceLoader: @@ -163,11 +197,15 @@ class TestConfluenceLoader: confluence_loader.confluence = mock_confluence return confluence_loader - def _get_mock_page(self, page_id: str) -> Dict: + def _get_mock_page( + self, page_id: str, content_format: ContentFormat = ContentFormat.STORAGE + ) -> Dict: return { "id": f"{page_id}", "title": f"Page {page_id}", - "body": {"storage": {"value": f"<p>Content {page_id}</p>"}}, + "body": { + f"{content_format.name.lower()}": {"value": f"<p>Content {page_id}</p>"} + }, "status": "current", "type": "page", "_links": {