### Description

This pull request adds new document loaders that load documents of various formats using [Dedoc](https://github.com/ispras/dedoc):

- `DedocFileLoader` (detects the file type automatically and parses it)
- `DedocPDFLoader` (parses `PDF` files and images)
- `DedocAPIFileLoader` (detects the file type automatically and parses it via the Dedoc API, without installing the library)

[Dedoc](https://dedoc.readthedocs.io) is an open-source library/service that extracts text, tables, attached files and document structure (e.g., titles, list items, etc.) from files of various formats. The library is actively developed and maintained by a group of developers.

`Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more. The full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1). For `PDF` documents, `Dedoc` can determine the correctness of the textual layer and split the document into paragraphs.

### Issue

This pull request extends the variety of document loaders supported by `langchain_community`, allowing users to choose the most suitable option for parsing raw documents.

### Dependencies

The PR adds a new (optional) dependency `dedoc>=2.2.5` ([library documentation](https://dedoc.readthedocs.io)) to `extended_testing_deps.txt`.

### Twitter handle

None

### Add tests and docs

1. Test for the integration: `libs/community/tests/integration_tests/document_loaders/test_dedoc.py`
2. Example notebook: `docs/docs/integrations/document_loaders/dedoc.ipynb`
3. Information about the library: `docs/docs/integrations/providers/dedoc.mdx`

### Lint and test

Done locally:

- `make format`
- `make lint`
- `make integration_tests`
- `make docs_build` (from the project root)

---------

Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
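A minimal usage sketch of the two general-purpose loaders (the file names and the API URL are illustrative; a local `dedoc` installation or a running Dedoc API container is assumed):

```python
from langchain_community.document_loaders import DedocAPIFileLoader, DedocFileLoader

# Local parsing: requires `pip install dedoc`; "example.docx" is a hypothetical file.
loader = DedocFileLoader("example.docx", split="page", with_tables=True)
docs = loader.load()

# Remote parsing: assumes a Dedoc API container listening on port 1231.
api_loader = DedocAPIFileLoader("example.pdf", url="http://localhost:1231")
for doc in api_loader.lazy_load():
    print(doc.metadata.get("type", "text"), doc.page_content[:80])
```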
import html
import json
import os
from abc import ABC, abstractmethod
from typing import (
    Dict,
    Iterator,
    Optional,
    Tuple,
    Union,
)

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

class DedocBaseLoader(BaseLoader, ABC):
    """
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
          (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
          are split according to the `split` parameter.
          For attachments, langchain Document object has an additional metadata field
          `type`="attachment".
        * `Tables` (when with_tables=True) are not split - each table corresponds to one
          langchain Document object.
          For tables, Document object has additional metadata fields `type`="table"
          and `text_as_html` with table HTML representation.
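
    Example of separating tables and attachments from plain text by their metadata
    (an illustrative sketch using the `DedocFileLoader` subclass; the file name is
    hypothetical):

    .. code-block:: python

        from langchain_community.document_loaders import DedocFileLoader

        loader = DedocFileLoader(
            "example.docx",  # hypothetical file path
            with_tables=True,
            with_attachments=True,
        )
        docs = loader.load()
        tables = [doc for doc in docs if doc.metadata.get("type") == "table"]
        attachments = [doc for doc in docs if doc.metadata.get("type") == "attachment"]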
    """

    def __init__(
        self,
        file_path: str,
        *,
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain Document
                    object (don't split)
                "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
                    ODP)
                "node": split document text into tree nodes (title nodes, list item
                    nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
            (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
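
        Example of PDF-specific parameters (an illustrative sketch; the file name and
        the chosen values are hypothetical, picked from the options listed above):

        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                "scanned.pdf",  # hypothetical PDF without a textual layer
                split="page",
                pdf_with_text_layer="false",
                language="eng",
                need_header_footer_analysis=True,
            )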
        """
        self.parsing_parameters = {
            key: value
            for key, value in locals().items()
            if key not in {"self", "file_path", "split", "with_tables"}
        }
        self.valid_split_values = {"document", "page", "node", "line"}
        if split not in self.valid_split_values:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )
        self.split = split
        self.with_tables = with_tables
        self.file_path = file_path

        structure_type = "tree" if self.split == "node" else "linear"
        self.parsing_parameters["structure_type"] = structure_type
        self.parsing_parameters["need_content_analysis"] = with_attachments

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents."""
        import tempfile

        try:
            from dedoc import DedocManager
        except ImportError:
            raise ImportError(
                "`dedoc` package not found, please install it with `pip install dedoc`"
            )
        dedoc_manager = DedocManager(manager_config=self._make_config())
        dedoc_manager.config["logger"].disabled = True

        with tempfile.TemporaryDirectory() as tmpdir:
            document_tree = dedoc_manager.parse(
                file_path=self.file_path,
                parameters={**self.parsing_parameters, "attachments_dir": tmpdir},
            )
        yield from self._split_document(
            document_tree=document_tree.to_api_schema().dict(), split=self.split
        )

    @abstractmethod
    def _make_config(self) -> dict:
        """
        Make configuration for DedocManager according to the file extension and
        parsing parameters.
        """
        pass

    def _json2txt(self, paragraph: dict) -> str:
        """Get text (recursively) of the document tree node."""
        subparagraphs_text = "\n".join(
            [
                self._json2txt(subparagraph)
                for subparagraph in paragraph["subparagraphs"]
            ]
        )
        text = (
            f"{paragraph['text']}\n{subparagraphs_text}"
            if subparagraphs_text
            else paragraph["text"]
        )
        return text

    def _parse_subparagraphs(
        self, document_tree: dict, document_metadata: dict
    ) -> Iterator[Document]:
        """Parse recursively document tree obtained by `dedoc`."""
        if len(document_tree["subparagraphs"]) > 0:
            for subparagraph in document_tree["subparagraphs"]:
                yield from self._parse_subparagraphs(
                    document_tree=subparagraph, document_metadata=document_metadata
                )
        else:
            yield Document(
                page_content=document_tree["text"],
                metadata={**document_metadata, **document_tree["metadata"]},
            )

    def _split_document(
        self,
        document_tree: dict,
        split: str,
        additional_metadata: Optional[dict] = None,
    ) -> Iterator[Document]:
        """Split document into parts according to the `split` parameter."""
        document_metadata = document_tree["metadata"]
        if additional_metadata:
            document_metadata = {**document_metadata, **additional_metadata}

        if split == "document":
            text = self._json2txt(paragraph=document_tree["content"]["structure"])
            yield Document(page_content=text, metadata=document_metadata)

        elif split == "page":
            nodes = document_tree["content"]["structure"]["subparagraphs"]
            page_id = nodes[0]["metadata"]["page_id"]
            page_text = ""

            for node in nodes:
                if node["metadata"]["page_id"] == page_id:
                    page_text += self._json2txt(node)
                else:
                    yield Document(
                        page_content=page_text,
                        metadata={**document_metadata, "page_id": page_id},
                    )
                    page_id = node["metadata"]["page_id"]
                    page_text = self._json2txt(node)

            yield Document(
                page_content=page_text,
                metadata={**document_metadata, "page_id": page_id},
            )

        elif split == "line":
            for node in document_tree["content"]["structure"]["subparagraphs"]:
                line_metadata = node["metadata"]
                yield Document(
                    page_content=self._json2txt(node),
                    metadata={**document_metadata, **line_metadata},
                )

        elif split == "node":
            yield from self._parse_subparagraphs(
                document_tree=document_tree["content"]["structure"],
                document_metadata=document_metadata,
            )

        else:
            raise ValueError(
                f"Got {split} for `split`, but should be one of "
                f"`{self.valid_split_values}`"
            )

        if self.with_tables:
            for table in document_tree["content"]["tables"]:
                table_text, table_html = self._get_table(table)
                yield Document(
                    page_content=table_text,
                    metadata={
                        **table["metadata"],
                        "type": "table",
                        "text_as_html": table_html,
                    },
                )

        for attachment in document_tree["attachments"]:
            yield from self._split_document(
                document_tree=attachment,
                split=self.split,
                additional_metadata={"type": "attachment"},
            )

    def _get_table(self, table: dict) -> Tuple[str, str]:
        """Get text and HTML representation of the table."""
        table_text = ""
        for row in table["cells"]:
            for cell in row:
                table_text += " ".join(line["text"] for line in cell["lines"])
                table_text += "\t"
            table_text += "\n"

        table_html = (
            '<table border="1" style="border-collapse: collapse; width: 100%;'
            '">\n<tbody>\n'
        )
        for row in table["cells"]:
            table_html += "<tr>\n"
            for cell in row:
                cell_text = "\n".join(line["text"] for line in cell["lines"])
                cell_text = html.escape(cell_text)
                table_html += "<td"
                if cell["invisible"]:
                    table_html += ' style="display: none" '
                table_html += (
                    f' colspan="{cell["colspan"]}" rowspan='
                    f'"{cell["rowspan"]}">{cell_text}</td>\n'
                )
            table_html += "</tr>\n"
        table_html += "</tbody>\n</table>"

        return table_text, table_html

class DedocFileLoader(DedocBaseLoader):
    """
    DedocFileLoader document loader integration to load files using `dedoc`.

    The file loader automatically detects the file type (with the correct extension).
    The list of supported file types is given at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """

    def _make_config(self) -> dict:
        from dedoc.utils.langchain import make_manager_config

        return make_manager_config(
            file_path=self.file_path,
            parsing_params=self.parsing_parameters,
            split=self.split,
        )

class DedocAPIFileLoader(DedocBaseLoader):
    """
    Load files using `dedoc` API.
    The file loader automatically detects the file type (even with the wrong extension).
    By default, the loader makes a call to the locally hosted `dedoc` API.
    More information about `dedoc` API can be found in `dedoc` documentation:
    https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install the `dedoc` library to use this loader.
        Instead, the `dedoc` API needs to be running.
        You may use a Docker container for this purpose.
        Please see `dedoc` documentation for more details:
        https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231 dedocproject/dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    """

    def __init__(
        self,
        file_path: str,
        *,
        url: str = "http://0.0.0.0:1231",
        split: str = "document",
        with_tables: bool = True,
        with_attachments: Union[str, bool] = False,
        recursion_deep_attachments: int = 10,
        pdf_with_text_layer: str = "auto_tabby",
        language: str = "rus+eng",
        pages: str = ":",
        is_one_column_document: str = "auto",
        document_orientation: str = "auto",
        need_header_footer_analysis: Union[str, bool] = False,
        need_binarization: Union[str, bool] = False,
        need_pdf_table_analysis: Union[str, bool] = True,
        delimiter: Optional[str] = None,
        encoding: Optional[str] = None,
    ) -> None:
        """Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document object
                    (don't split)
                "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document into tree nodes (title nodes, list item nodes,
                    raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
            (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
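
        Example (an illustrative sketch; the file name is hypothetical and the URL
        assumes a locally running `dedoc` API container on port 1231):

        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                "example.pdf",  # hypothetical file path
                url="http://localhost:1231",
                split="line",
            )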
        """
        super().__init__(
            file_path=file_path,
            split=split,
            with_tables=with_tables,
            with_attachments=with_attachments,
            recursion_deep_attachments=recursion_deep_attachments,
            pdf_with_text_layer=pdf_with_text_layer,
            language=language,
            pages=pages,
            is_one_column_document=is_one_column_document,
            document_orientation=document_orientation,
            need_header_footer_analysis=need_header_footer_analysis,
            need_binarization=need_binarization,
            need_pdf_table_analysis=need_pdf_table_analysis,
            delimiter=delimiter,
            encoding=encoding,
        )
        self.url = url
        self.parsing_parameters["return_format"] = "json"

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents."""
        doc_tree = self._send_file(
            url=self.url, file_path=self.file_path, parameters=self.parsing_parameters
        )
        yield from self._split_document(document_tree=doc_tree, split=self.split)

    def _make_config(self) -> dict:
        return {}

    def _send_file(
        self, url: str, file_path: str, parameters: dict
    ) -> Dict[str, Union[list, dict, str]]:
        """Send a POST request to the `dedoc` API and return the results."""
        import requests

        file_name = os.path.basename(file_path)
        with open(file_path, "rb") as file:
            files = {"file": (file_name, file)}
            r = requests.post(f"{url}/upload", files=files, data=parameters)

        if r.status_code != 200:
            raise ValueError(f"Error during file handling: {r.content.decode()}")

        result = json.loads(r.content.decode())
        return result