mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-02 19:47:13 +00:00
community[minor]: added new document loaders based on dedoc library (#24303)
### Description This pull request added new document loaders to load documents of various formats using [Dedoc](https://github.com/ispras/dedoc): - `DedocFileLoader` (determine file types automatically and parse) - `DedocPDFLoader` (for `PDF` and images parsing) - `DedocAPIFileLoader` (determine file types automatically and parse using Dedoc API without library installation) [Dedoc](https://dedoc.readthedocs.io) is an open-source library/service that extracts texts, tables, attached files and document structure (e.g., titles, list items, etc.) from files of various formats. The library is actively developed and maintained by a group of developers. `Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more. Full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1). For `PDF` documents, `Dedoc` allows to determine textual layer correctness and split the document into paragraphs. ### Issue This pull request extends variety of document loaders supported by `langchain_community` allowing users to choose the most suitable option for raw documents parsing. ### Dependencies The PR added a new (optional) dependency `dedoc>=2.2.5` ([library documentation](https://dedoc.readthedocs.io)) to the `extended_testing_deps.txt` ### Twitter handle None ### Add tests and docs 1. Test for the integration: `libs/community/tests/integration_tests/document_loaders/test_dedoc.py` 2. Example notebook: `docs/docs/integrations/document_loaders/dedoc.ipynb` 3. Information about the library: `docs/docs/integrations/providers/dedoc.mdx` ### Lint and test Done locally: - `make format` - `make lint` - `make integration_tests` - `make docs_build` (from the project root) --------- Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
This commit is contained in:
committed by
GitHub
parent
5ac936a284
commit
2a70a07aad
@@ -16,6 +16,7 @@ cloudpickle>=2.0.0
|
||||
cohere>=4,<6
|
||||
databricks-vectorsearch>=0.21,<0.22
|
||||
datasets>=2.15.0,<3
|
||||
dedoc>=2.2.6,<3
|
||||
dgml-utils>=0.3.0,<0.4
|
||||
elasticsearch>=8.12.0,<9
|
||||
esprima>=4.0.1,<5
|
||||
|
@@ -142,6 +142,10 @@ if TYPE_CHECKING:
|
||||
from langchain_community.document_loaders.dataframe import (
|
||||
DataFrameLoader,
|
||||
)
|
||||
from langchain_community.document_loaders.dedoc import (
|
||||
DedocAPIFileLoader,
|
||||
DedocFileLoader,
|
||||
)
|
||||
from langchain_community.document_loaders.diffbot import (
|
||||
DiffbotLoader,
|
||||
)
|
||||
@@ -340,6 +344,7 @@ if TYPE_CHECKING:
|
||||
)
|
||||
from langchain_community.document_loaders.pdf import (
|
||||
AmazonTextractPDFLoader,
|
||||
DedocPDFLoader,
|
||||
MathpixPDFLoader,
|
||||
OnlinePDFLoader,
|
||||
PagedPDFSplitter,
|
||||
@@ -570,6 +575,9 @@ _module_lookup = {
|
||||
"CubeSemanticLoader": "langchain_community.document_loaders.cube_semantic",
|
||||
"DataFrameLoader": "langchain_community.document_loaders.dataframe",
|
||||
"DatadogLogsLoader": "langchain_community.document_loaders.datadog_logs",
|
||||
"DedocAPIFileLoader": "langchain_community.document_loaders.dedoc",
|
||||
"DedocFileLoader": "langchain_community.document_loaders.dedoc",
|
||||
"DedocPDFLoader": "langchain_community.document_loaders.pdf",
|
||||
"DiffbotLoader": "langchain_community.document_loaders.diffbot",
|
||||
"DirectoryLoader": "langchain_community.document_loaders.directory",
|
||||
"DiscordChatLoader": "langchain_community.document_loaders.discord",
|
||||
@@ -771,6 +779,9 @@ __all__ = [
|
||||
"CubeSemanticLoader",
|
||||
"DataFrameLoader",
|
||||
"DatadogLogsLoader",
|
||||
"DedocAPIFileLoader",
|
||||
"DedocFileLoader",
|
||||
"DedocPDFLoader",
|
||||
"DiffbotLoader",
|
||||
"DirectoryLoader",
|
||||
"DiscordChatLoader",
|
||||
|
546
libs/community/langchain_community/document_loaders/dedoc.py
Normal file
546
libs/community/langchain_community/document_loaders/dedoc.py
Normal file
@@ -0,0 +1,546 @@
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import (
|
||||
Dict,
|
||||
Iterator,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class DedocBaseLoader(BaseLoader, ABC):
    """
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
          (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
          are split according to the `split` parameter.
          For attachments, langchain Document object has an additional metadata field
          `type`="attachment".
        * `Tables` (when with_tables=True) are not split - each table corresponds to one
          langchain Document object.
          For tables, Document object has additional metadata fields `type`="table"
          and `text_as_html` with table HTML representation.
    """

    def __init__(
        self,
        file_path: str,
        *,
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain Document
                    object (don't split)
                "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
                    ODP)
                "node": split document text into tree nodes (title nodes, list item
                    nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
            (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV

        Raises:
            ValueError: if `split` is not one of "document", "page", "node", "line".
        """
        # NOTE: locals() must be captured before any other local variable is
        # created, so that exactly the keyword parameters (minus the excluded
        # names) become the `dedoc` parsing parameters.
        self.parsing_parameters = {
            key: value
            for key, value in locals().items()
            if key not in {"self", "file_path", "split", "with_tables"}
        }
        self.valid_split_values = {"document", "page", "node", "line"}
        if split not in self.valid_split_values:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )
        self.split = split
        self.with_tables = with_tables
        self.file_path = file_path

        # Only the "node" split needs the hierarchical document structure;
        # every other split type works on the flat (linear) structure.
        structure_type = "tree" if self.split == "node" else "linear"
        self.parsing_parameters["structure_type"] = structure_type
        self.parsing_parameters["need_content_analysis"] = with_attachments

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents."""
        import tempfile

        try:
            from dedoc import DedocManager
        except ImportError as e:
            raise ImportError(
                "`dedoc` package not found, please install it with `pip install dedoc`"
            ) from e
        dedoc_manager = DedocManager(manager_config=self._make_config())
        dedoc_manager.config["logger"].disabled = True

        # Attachments are extracted into a temporary directory that is removed
        # once the parsed tree has been converted into Documents.
        with tempfile.TemporaryDirectory() as tmpdir:
            document_tree = dedoc_manager.parse(
                file_path=self.file_path,
                parameters={**self.parsing_parameters, "attachments_dir": tmpdir},
            )
            yield from self._split_document(
                document_tree=document_tree.to_api_schema().dict(), split=self.split
            )

    @abstractmethod
    def _make_config(self) -> dict:
        """
        Make configuration for DedocManager according to the file extension and
        parsing parameters.
        """

    def _json2txt(self, paragraph: dict) -> str:
        """Get text (recursively) of the document tree node."""
        subparagraphs_text = "\n".join(
            [
                self._json2txt(subparagraph)
                for subparagraph in paragraph["subparagraphs"]
            ]
        )
        text = (
            f"{paragraph['text']}\n{subparagraphs_text}"
            if subparagraphs_text
            else paragraph["text"]
        )
        return text

    def _parse_subparagraphs(
        self, document_tree: dict, document_metadata: dict
    ) -> Iterator[Document]:
        """Parse recursively document tree obtained by `dedoc`.

        Only leaf nodes become Documents; inner nodes are traversed.
        """
        if len(document_tree["subparagraphs"]) > 0:
            for subparagraph in document_tree["subparagraphs"]:
                yield from self._parse_subparagraphs(
                    document_tree=subparagraph, document_metadata=document_metadata
                )
        else:
            yield Document(
                page_content=document_tree["text"],
                metadata={**document_metadata, **document_tree["metadata"]},
            )

    def _split_document(
        self,
        document_tree: dict,
        split: str,
        additional_metadata: Optional[dict] = None,
    ) -> Iterator[Document]:
        """Split document into parts according to the `split` parameter.

        Args:
            document_tree: parsed document tree in `dedoc` API schema
            split: one of "document", "page", "node", "line"
            additional_metadata: extra metadata merged into every yielded
                Document (used to mark attachments)

        Raises:
            ValueError: if `split` is not a valid split value.
        """
        document_metadata = document_tree["metadata"]
        if additional_metadata:
            document_metadata = {**document_metadata, **additional_metadata}

        if split == "document":
            text = self._json2txt(paragraph=document_tree["content"]["structure"])
            yield Document(page_content=text, metadata=document_metadata)

        elif split == "page":
            nodes = document_tree["content"]["structure"]["subparagraphs"]
            # Guard against an empty document: without it, nodes[0] below
            # raises IndexError. An empty document simply yields no pages.
            if nodes:
                page_id = nodes[0]["metadata"]["page_id"]
                page_text = ""

                for node in nodes:
                    if node["metadata"]["page_id"] == page_id:
                        page_text += self._json2txt(node)
                    else:
                        # Page boundary: flush the accumulated page and start
                        # collecting the next one.
                        yield Document(
                            page_content=page_text,
                            metadata={**document_metadata, "page_id": page_id},
                        )
                        page_id = node["metadata"]["page_id"]
                        page_text = self._json2txt(node)

                # Flush the last accumulated page.
                yield Document(
                    page_content=page_text,
                    metadata={**document_metadata, "page_id": page_id},
                )

        elif split == "line":
            for node in document_tree["content"]["structure"]["subparagraphs"]:
                line_metadata = node["metadata"]
                yield Document(
                    page_content=self._json2txt(node),
                    metadata={**document_metadata, **line_metadata},
                )

        elif split == "node":
            yield from self._parse_subparagraphs(
                document_tree=document_tree["content"]["structure"],
                document_metadata=document_metadata,
            )

        else:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )

        if self.with_tables:
            for table in document_tree["content"]["tables"]:
                table_text, table_html = self._get_table(table)
                yield Document(
                    page_content=table_text,
                    metadata={
                        **table["metadata"],
                        "type": "table",
                        "text_as_html": table_html,
                    },
                )

        # Attachments are themselves full document trees and are split
        # recursively with the same `split` setting.
        for attachment in document_tree["attachments"]:
            yield from self._split_document(
                document_tree=attachment,
                split=self.split,
                additional_metadata={"type": "attachment"},
            )

    def _get_table(self, table: dict) -> Tuple[str, str]:
        """Get text and HTML representation of the table.

        Returns:
            Tuple of (tab-separated plain text, HTML `<table>` markup).
        """
        table_text = ""
        for row in table["cells"]:
            for cell in row:
                table_text += " ".join(line["text"] for line in cell["lines"])
                table_text += "\t"
            table_text += "\n"

        table_html = (
            '<table border="1" style="border-collapse: collapse; width: 100%;'
            '">\n<tbody>\n'
        )
        for row in table["cells"]:
            table_html += "<tr>\n"
            for cell in row:
                cell_text = "\n".join(line["text"] for line in cell["lines"])
                # Escape cell content so raw text cannot break the HTML markup.
                cell_text = html.escape(cell_text)
                table_html += "<td"
                if cell["invisible"]:
                    table_html += ' style="display: none" '
                table_html += (
                    f' colspan="{cell["colspan"]}" rowspan='
                    f'"{cell["rowspan"]}">{cell_text}</td>\n'
                )
            table_html += "</tr>\n"
        table_html += "</tbody>\n</table>"

        return table_text, table_html
||||
class DedocFileLoader(DedocBaseLoader):
    """
    DedocFileLoader document loader integration to load files using `dedoc`.

    The file loader automatically detects the file type (with the correct extension).
    The list of supported file types is given at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """

    def _make_config(self) -> dict:
        """Build a DedocManager configuration for the file's extension."""
        # Imported lazily: `dedoc` is an optional dependency and is only
        # required when a document is actually loaded.
        from dedoc.utils.langchain import make_manager_config

        return make_manager_config(
            file_path=self.file_path,
            parsing_params=self.parsing_parameters,
            split=self.split,
        )
|
||||
|
||||
class DedocAPIFileLoader(DedocBaseLoader):
    """
    Load files using `dedoc` API.
    The file loader automatically detects the file type (even with the wrong extension).
    By default, the loader makes a call to the locally hosted `dedoc` API.
    More information about `dedoc` API can be found in `dedoc` documentation:
        https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install `dedoc` library for using this loader.
        Instead, the `dedoc` API needs to be run.
        You may use Docker container for this purpose.
        Please see `dedoc` documentation for more details:
            https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231 --rm dedocproject/dedoc python3 /dedoc_root/dedoc/main.py

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """

    def __init__(
        self,
        file_path: str,
        *,
        url: str = "http://0.0.0.0:1231",
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document object
                    (don't split)
                "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document into tree nodes (title nodes, list item nodes,
                    raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
            (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        """
        super().__init__(
            file_path=file_path,
            split=split,
            with_tables=with_tables,
            with_attachments=with_attachments,
            recursion_deep_attachments=recursion_deep_attachments,
            pdf_with_text_layer=pdf_with_text_layer,
            language=language,
            pages=pages,
            is_one_column_document=is_one_column_document,
            document_orientation=document_orientation,
            need_header_footer_analysis=need_header_footer_analysis,
            need_binarization=need_binarization,
            need_pdf_table_analysis=need_pdf_table_analysis,
            delimiter=delimiter,
            encoding=encoding,
        )
        self.url = url
        # The API must return JSON so that the tree can be split locally.
        self.parsing_parameters["return_format"] = "json"

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents."""
        doc_tree = self._send_file(
            url=self.url, file_path=self.file_path, parameters=self.parsing_parameters
        )
        yield from self._split_document(document_tree=doc_tree, split=self.split)

    def _make_config(self) -> dict:
        # Parsing is delegated to the remote API, so no local DedocManager
        # configuration is needed.
        return {}

    def _send_file(
        self, url: str, file_path: str, parameters: dict
    ) -> Dict[str, Union[list, dict, str]]:
        """Send POST-request to `dedoc` API and return the results.

        Raises:
            ValueError: if the API responds with a non-200 status code.
        """
        import requests

        file_name = os.path.basename(file_path)
        with open(file_path, "rb") as file:
            files = {"file": (file_name, file)}
            r = requests.post(f"{url}/upload", files=files, data=parameters)

        if r.status_code != 200:
            raise ValueError(f"Error during file handling: {r.content.decode()}")

        result = json.loads(r.content.decode())
        return result
@@ -26,6 +26,7 @@ from langchain_core.utils import get_from_dict_or_env
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.dedoc import DedocBaseLoader
|
||||
from langchain_community.document_loaders.parsers.pdf import (
|
||||
AmazonTextractPDFParser,
|
||||
DocumentIntelligenceParser,
|
||||
@@ -738,6 +739,104 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
raise ValueError(f"unsupported mime type: {blob.mimetype}") # type: ignore[attr-defined]
|
||||
|
||||
|
||||
class DedocPDFLoader(DedocBaseLoader):
    """
    DedocPDFLoader document loader integration to load PDF files using `dedoc`.
    The file loader can automatically detect the correctness of a textual layer in the
    PDF document.
    Note that `__init__` method supports parameters that differ from ones of
    DedocBaseLoader.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocPDFLoader

            loader = DedocPDFLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Parameters used for document parsing via `dedoc`
    (https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html):

        with_attachments: enable attached files extraction
        recursion_deep_attachments: recursion level for attached files extraction,
            works only when with_attachments==True
        pdf_with_text_layer: type of handler for parsing, available options
            ["true", "false", "tabby", "auto", "auto_tabby" (default)]
        language: language of the document for PDF without a textual layer,
            available options ["eng", "rus", "rus+eng" (default)], the list of
            languages can be extended, please see
            https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
        pages: page slice to define the reading range for parsing
        is_one_column_document: detect number of columns for PDF without a textual
            layer, available options ["true", "false", "auto" (default)]
        document_orientation: fix document orientation (90, 180, 270 degrees) for PDF
            without a textual layer, available options ["auto" (default), "no_change"]
        need_header_footer_analysis: remove headers and footers from the output result
        need_binarization: clean pages background (binarize) for PDF without a textual
            layer
        need_pdf_table_analysis: parse tables for PDF without a textual layer
    """

    def _make_config(self) -> dict:
        # Imported lazily: `dedoc` is an optional dependency and is only
        # required when a document is actually loaded.
        from dedoc.utils.langchain import make_manager_pdf_config

        # Delegates to the PDF-specific helper so only PDF handlers are set up.
        return make_manager_pdf_config(
            file_path=self.file_path,
            parsing_params=self.parsing_parameters,
            split=self.split,
        )
|
||||
|
||||
class DocumentIntelligenceLoader(BasePDFLoader):
|
||||
"""Load a PDF with Azure Document Intelligence"""
|
||||
|
||||
|
@@ -0,0 +1,146 @@
|
||||
import os
from pathlib import Path

from langchain_community.document_loaders import (
    DedocAPIFileLoader,
    DedocFileLoader,
    DedocPDFLoader,
)

# Directory with the shared example documents used by the integration tests.
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")

# Example files of various formats, used to exercise automatic file type
# detection in DedocFileLoader.
FILE_NAMES = [
    "example.html",
    "example.json",
    "fake-email-attachment.eml",
    "layout-parser-paper.pdf",
    "slack_export.zip",
    "stanley-cups.csv",
    "stanley-cups.xlsx",
    "whatsapp_chat.txt",
]
|
||||
|
||||
def test_dedoc_file_loader() -> None:
    """Every example file parsed with split="document" yields one Document."""
    for name in FILE_NAMES:
        path = os.path.join(EXAMPLE_DOCS_DIRECTORY, name)
        documents = DedocFileLoader(
            path,
            split="document",
            with_tables=False,
            pdf_with_text_layer="tabby",
            pages=":1",
        ).load()

        assert len(documents) == 1
||||
|
||||
def test_dedoc_pdf_loader() -> None:
    """Both PDF text-layer handlers produce a single Document for one page."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    for text_layer_mode in ("true", "tabby"):
        documents = DedocPDFLoader(
            pdf_path,
            split="document",
            with_tables=False,
            pdf_with_text_layer=text_layer_mode,
            pages=":1",
        ).load()

        assert len(documents) == 1
||||
|
||||
def test_dedoc_content_html() -> None:
    """HTML content is extracted line by line with correct file metadata."""
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.html")
    documents = DedocFileLoader(path, split="line", with_tables=False).load()

    first = documents[0]
    assert first.metadata["file_name"] == "example.html"
    assert first.metadata["file_type"] == "text/html"
    assert "Instead of drinking water from the cat bowl" in first.page_content
    assert "Chase the red dot" not in first.page_content
||||
|
||||
def test_dedoc_content_pdf() -> None:
    """PDF split by page yields per-page Documents plus extracted tables."""
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    documents = DedocFileLoader(
        path, split="page", pdf_with_text_layer="tabby", pages=":5"
    ).load()
    tables = [doc for doc in documents if doc.metadata.get("type", "") == "table"]

    assert len(documents) == 6
    assert documents[0].metadata["file_name"] == "layout-parser-paper.pdf"
    assert documents[0].metadata["file_type"] == "application/pdf"
    assert (
        "This paper introduces LayoutParser, an open-source"
        in documents[0].page_content
    )
    assert (
        "layout detection [38, 22], table detection [26]" in documents[1].page_content
    )
    assert (
        "LayoutParser: A Unified Toolkit for DL-Based DIA" in documents[2].page_content
    )
    assert len(tables) > 0
    assert (
        '\n<tbody>\n<tr>\n<td colspan="1" rowspan="1">'
        in tables[0].metadata["text_as_html"]
    )
||||
|
||||
def test_dedoc_content_json() -> None:
    """JSON split into tree nodes yields one Document per leaf node."""
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.json")
    documents = DedocFileLoader(path, split="node").load()

    assert len(documents) == 11
    assert documents[0].metadata["file_name"] == "example.json"
    assert documents[0].metadata["file_type"] == "application/json"
    assert "Bye!" in documents[0].page_content
||||
|
||||
def test_dedoc_content_txt() -> None:
    """Plain-text file split into lines keeps per-line chat messages."""
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt")
    documents = DedocFileLoader(path, split="line").load()

    assert len(documents) == 10
    assert documents[0].metadata["file_name"] == "whatsapp_chat.txt"
    assert documents[0].metadata["file_type"] == "text/plain"
    assert "[05.05.23, 15:48:11] James: Hi here" in documents[0].page_content
    assert "[11/8/21, 9:41:32 AM] User name: Message 123" in documents[1].page_content
    assert "1/23/23, 3:19 AM - User 2: Bye!" in documents[2].page_content
||||
|
||||
def test_dedoc_table_handling() -> None:
    """CSV tables are returned as an extra Document with HTML metadata."""
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.csv")
    documents = DedocFileLoader(path, split="document").load()
    text_doc, table_doc = documents[0], documents[1]

    assert len(documents) == 2
    assert text_doc.metadata["file_name"] == "stanley-cups.csv"
    assert text_doc.metadata["file_type"] == "text/csv"
    assert table_doc.metadata["type"] == "table"
    assert '<td colspan="1" rowspan="1">1</td>' in table_doc.metadata["text_as_html"]
    assert "Maple Leafs\tTOR\t13" in table_doc.page_content
||||
|
||||
def test_dedoc_api_file_loader() -> None:
    """The API-backed loader matches the local loader's line-split output."""
    path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt")
    documents = DedocAPIFileLoader(
        path, split="line", url="https://dedoc-readme.hf.space"
    ).load()

    assert len(documents) == 10
    assert documents[0].metadata["file_name"] == "whatsapp_chat.txt"
    assert documents[0].metadata["file_type"] == "text/plain"
    assert "[05.05.23, 15:48:11] James: Hi here" in documents[0].page_content
    assert "[11/8/21, 9:41:32 AM] User name: Message 123" in documents[1].page_content
    assert "1/23/23, 3:19 AM - User 2: Bye!" in documents[2].page_content
@@ -51,6 +51,9 @@ EXPECTED_ALL = [
|
||||
"CubeSemanticLoader",
|
||||
"DataFrameLoader",
|
||||
"DatadogLogsLoader",
|
||||
"DedocAPIFileLoader",
|
||||
"DedocFileLoader",
|
||||
"DedocPDFLoader",
|
||||
"PebbloSafeLoader",
|
||||
"DiffbotLoader",
|
||||
"DirectoryLoader",
|
||||
|
Reference in New Issue
Block a user