mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-19 14:01:50 +00:00
### Description

This pull request adds new document loaders that load documents of various formats using [Dedoc](https://github.com/ispras/dedoc):

- `DedocFileLoader` (determines the file type automatically and parses the file)
- `DedocPDFLoader` (for parsing `PDF` files and images)
- `DedocAPIFileLoader` (determines the file type automatically and parses the file using the Dedoc API, without installing the library)

[Dedoc](https://dedoc.readthedocs.io) is an open-source library/service that extracts texts, tables, attached files and document structure (e.g., titles, list items, etc.) from files of various formats. The library is actively developed and maintained by a group of developers. `Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more. The full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1). For `PDF` documents, `Dedoc` can determine whether the textual layer is correct and split the document into paragraphs.

### Issue

This pull request extends the variety of document loaders supported by `langchain_community`, allowing users to choose the most suitable option for parsing raw documents.

### Dependencies

The PR adds a new (optional) dependency `dedoc>=2.2.5` ([library documentation](https://dedoc.readthedocs.io)) to `extended_testing_deps.txt`.

### Twitter handle

None

### Add tests and docs

1. Test for the integration: `libs/community/tests/integration_tests/document_loaders/test_dedoc.py`
2. Example notebook: `docs/docs/integrations/document_loaders/dedoc.ipynb`
3. Information about the library: `docs/docs/integrations/providers/dedoc.mdx`

### Lint and test

Done locally:

- `make format`
- `make lint`
- `make integration_tests`
- `make docs_build` (from the project root)

---------

Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
147 lines
4.8 KiB
Python
147 lines
4.8 KiB
Python
import os
|
|
from pathlib import Path
|
|
|
|
from langchain_community.document_loaders import (
|
|
DedocAPIFileLoader,
|
|
DedocFileLoader,
|
|
DedocPDFLoader,
|
|
)
|
|
|
|
# Directory holding the sample documents: an "examples" folder located two
# levels above this file, stored as a plain string for use with os.path.join.
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent.joinpath("examples/"))

# Names of the sample files (relative to EXAMPLE_DOCS_DIRECTORY) that the
# generic file-loader test iterates over.
FILE_NAMES = [
    "example.html",
    "example.json",
    "fake-email-attachment.eml",
    "layout-parser-paper.pdf",
    "slack_export.zip",
    "stanley-cups.csv",
    "stanley-cups.xlsx",
    "whatsapp_chat.txt",
]
|
|
|
|
|
|
def test_dedoc_file_loader() -> None:
    """Check that each file in FILE_NAMES loads as a single document.

    Every file is loaded with ``split="document"``, tables disabled, the
    ``"tabby"`` PDF text-layer option and the page range ``":1"``; the loader
    is expected to return exactly one document per file.
    """
    for sample_name in FILE_NAMES:
        documents = DedocFileLoader(
            os.path.join(EXAMPLE_DOCS_DIRECTORY, sample_name),
            split="document",
            with_tables=False,
            pdf_with_text_layer="tabby",
            pages=":1",
        ).load()

        assert len(documents) == 1
|
|
|
|
|
|
def test_dedoc_pdf_loader() -> None:
    """Check that the PDF loader returns one document for each text-layer mode.

    The sample paper is loaded twice, once with ``pdf_with_text_layer="true"``
    and once with ``"tabby"``, using ``split="document"``, tables disabled and
    the page range ``":1"``.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    for text_layer_mode in ("true", "tabby"):
        documents = DedocPDFLoader(
            pdf_path,
            split="document",
            with_tables=False,
            pdf_with_text_layer=text_layer_mode,
            pages=":1",
        ).load()

        assert len(documents) == 1
|
|
|
|
|
|
def test_dedoc_content_html() -> None:
    """Check metadata and content of the first line of the sample HTML file.

    The file is loaded with ``split="line"`` and tables disabled; the first
    returned document must carry the expected file name and MIME type, contain
    the first expected phrase and not contain the second one.
    """
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.html")
    documents = DedocFileLoader(html_path, split="line", with_tables=False).load()

    first_doc = documents[0]
    assert first_doc.metadata["file_name"] == "example.html"
    assert first_doc.metadata["file_type"] == "text/html"
    assert "Instead of drinking water from the cat bowl" in first_doc.page_content
    assert "Chase the red dot" not in first_doc.page_content
|
|
|
|
|
|
def test_dedoc_content_pdf() -> None:
    """Check page-level output and table extraction for the sample PDF.

    The paper is loaded with ``split="page"``, the ``"tabby"`` text-layer
    option and the page range ``":5"``. The test expects six documents in
    total, known phrases in the first three of them, and at least one document
    whose metadata ``type`` is ``"table"`` with an HTML rendering in
    ``text_as_html``.
    """
    pdf_name = "layout-parser-paper.pdf"
    documents = DedocFileLoader(
        os.path.join(EXAMPLE_DOCS_DIRECTORY, pdf_name),
        split="page",
        pdf_with_text_layer="tabby",
        pages=":5",
    ).load()
    # Documents flagged as tables via their metadata "type" entry.
    tables = [doc for doc in documents if doc.metadata.get("type", "") == "table"]

    assert len(documents) == 6
    assert documents[0].metadata["file_name"] == "layout-parser-paper.pdf"
    assert documents[0].metadata["file_type"] == "application/pdf"
    assert (
        "This paper introduces LayoutParser, an open-source"
        in documents[0].page_content
    )
    assert (
        "layout detection [38, 22], table detection [26]"
        in documents[1].page_content
    )
    assert (
        "LayoutParser: A Unified Toolkit for DL-Based DIA"
        in documents[2].page_content
    )
    assert len(tables) > 0
    table_html = tables[0].metadata["text_as_html"]
    assert '\n<tbody>\n<tr>\n<td colspan="1" rowspan="1">' in table_html
|
|
|
|
|
|
def test_dedoc_content_json() -> None:
    """Check node-level splitting of the sample JSON file.

    Loading with ``split="node"`` must yield eleven documents; the first one
    must carry the expected file name and MIME type and contain ``"Bye!"``.
    """
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.json")
    documents = DedocFileLoader(json_path, split="node").load()

    assert len(documents) == 11

    first_doc = documents[0]
    assert first_doc.metadata["file_name"] == "example.json"
    assert first_doc.metadata["file_type"] == "application/json"
    assert "Bye!" in first_doc.page_content
|
|
|
|
|
|
def test_dedoc_content_txt() -> None:
    """Check line-level splitting of the sample plain-text chat export.

    Loading with ``split="line"`` must yield ten documents; the first one must
    carry the expected file name and MIME type, and the first three documents
    must contain the expected chat lines in order.
    """
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt")
    documents = DedocFileLoader(txt_path, split="line").load()

    assert len(documents) == 10
    assert documents[0].metadata["file_name"] == "whatsapp_chat.txt"
    assert documents[0].metadata["file_type"] == "text/plain"

    # Expected text of the first three lines, checked position by position.
    expected_lines = (
        "[05.05.23, 15:48:11] James: Hi here",
        "[11/8/21, 9:41:32 AM] User name: Message 123",
        "1/23/23, 3:19 AM - User 2: Bye!",
    )
    for position, expected in enumerate(expected_lines):
        assert expected in documents[position].page_content
|
|
|
|
|
|
def test_dedoc_table_handling() -> None:
    """Check that the sample CSV file produces a separate table document.

    Loading with ``split="document"`` must yield two documents: the first with
    the expected file name and MIME type, the second marked as a table whose
    metadata holds an HTML rendering and whose content holds tab-separated
    cell values.
    """
    csv_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.csv")
    documents = DedocFileLoader(csv_path, split="document").load()

    assert len(documents) == 2

    text_doc, table_doc = documents[0], documents[1]
    assert text_doc.metadata["file_name"] == "stanley-cups.csv"
    assert text_doc.metadata["file_type"] == "text/csv"
    assert table_doc.metadata["type"] == "table"
    assert (
        '<td colspan="1" rowspan="1">1</td>' in table_doc.metadata["text_as_html"]
    )
    assert "Maple Leafs\tTOR\t13" in table_doc.page_content
|
|
|
|
|
|
def test_dedoc_api_file_loader() -> None:
    """Check the API-based loader against the sample plain-text chat export.

    The file is loaded with ``split="line"`` through the service at
    ``https://dedoc-readme.hf.space``. The test expects ten documents, the
    expected file name and MIME type on the first one, and the expected chat
    lines in the first three documents.
    """
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt")
    api_loader = DedocAPIFileLoader(
        txt_path,
        split="line",
        url="https://dedoc-readme.hf.space",
    )
    documents = api_loader.load()

    assert len(documents) == 10
    assert documents[0].metadata["file_name"] == "whatsapp_chat.txt"
    assert documents[0].metadata["file_type"] == "text/plain"

    # Expected text of the first three lines, checked position by position.
    expected_lines = (
        "[05.05.23, 15:48:11] James: Hi here",
        "[11/8/21, 9:41:32 AM] User name: Message 123",
        "1/23/23, 3:19 AM - User 2: Bye!",
    )
    for position, expected in enumerate(expected_lines):
        assert expected in documents[position].page_content
|