mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-17 00:17:47 +00:00
Harrison/unstructured page number (#6464)
Co-authored-by: Reza Sanaie <reza@sanaie.ca>
This commit is contained in:
parent
b82ddf9cfb
commit
9eec7c3206
@ -226,13 +226,17 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "8de9ef16",
|
"id": "8de9ef16",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## PDF Example\n",
|
"## PDF Example\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of `elements`. "
|
"Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of elements. The modes of operation are:\n",
|
||||||
|
"- `single`: the text from all elements is combined into one document (default)\n",
|
||||||
|
"- `elements`: each element is kept as a separate document\n",
|
||||||
|
"- `paged`: the text of all elements on the same page is combined into one document per page"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
"""Loader that uses unstructured to load files."""
|
"""Loader that uses unstructured to load files."""
|
||||||
import collections
|
import collections
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import IO, Any, List, Sequence, Union
|
from typing import IO, Any, Dict, List, Sequence, Union
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
@ -45,7 +45,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
"unstructured package not found, please install it with "
|
"unstructured package not found, please install it with "
|
||||||
"`pip install unstructured`"
|
"`pip install unstructured`"
|
||||||
)
|
)
|
||||||
_valid_modes = {"single", "elements"}
|
_valid_modes = {"single", "elements", "paged"}
|
||||||
if mode not in _valid_modes:
|
if mode not in _valid_modes:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
|
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
|
||||||
@ -80,6 +80,31 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
if hasattr(element, "category"):
|
if hasattr(element, "category"):
|
||||||
metadata["category"] = element.category
|
metadata["category"] = element.category
|
||||||
docs.append(Document(page_content=str(element), metadata=metadata))
|
docs.append(Document(page_content=str(element), metadata=metadata))
|
||||||
|
elif self.mode == "paged":
|
||||||
|
text_dict: Dict[int, str] = {}
|
||||||
|
meta_dict: Dict[int, Dict] = {}
|
||||||
|
|
||||||
|
for idx, element in enumerate(elements):
|
||||||
|
metadata = self._get_metadata()
|
||||||
|
if hasattr(element, "metadata"):
|
||||||
|
metadata.update(element.metadata.to_dict())
|
||||||
|
page_number = metadata.get("page_number", 1)
|
||||||
|
|
||||||
|
# Check if this page_number already exists in text_dict
|
||||||
|
if page_number not in text_dict:
|
||||||
|
# If not, create new entry with initial text and metadata
|
||||||
|
text_dict[page_number] = str(element) + "\n\n"
|
||||||
|
meta_dict[page_number] = metadata
|
||||||
|
else:
|
||||||
|
# If exists, append to text and update the metadata
|
||||||
|
text_dict[page_number] += str(element) + "\n\n"
|
||||||
|
meta_dict[page_number].update(metadata)
|
||||||
|
|
||||||
|
# Convert the dict to a list of Document objects
|
||||||
|
docs = [
|
||||||
|
Document(page_content=text_dict[key], metadata=meta_dict[key])
|
||||||
|
for key in text_dict.keys()
|
||||||
|
]
|
||||||
elif self.mode == "single":
|
elif self.mode == "single":
|
||||||
metadata = self._get_metadata()
|
metadata = self._get_metadata()
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
text = "\n\n".join([str(el) for el in elements])
|
||||||
|
@ -11,7 +11,25 @@ from langchain.document_loaders import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_unstructured_pdf_loader() -> None:
|
def test_unstructured_pdf_loader_elements_mode() -> None:
|
||||||
|
"""Test unstructured loader in elements mode."""
|
||||||
|
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||||
|
loader = UnstructuredPDFLoader(str(file_path), mode="elements")
|
||||||
|
docs = loader.load()
|
||||||
|
|
||||||
|
assert len(docs) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_unstructured_pdf_loader_paged_mode() -> None:
|
||||||
|
"""Test unstructured loader in paged mode."""
|
||||||
|
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||||
|
loader = UnstructuredPDFLoader(str(file_path), mode="paged")
|
||||||
|
docs = loader.load()
|
||||||
|
|
||||||
|
assert len(docs) == 16
|
||||||
|
|
||||||
|
|
||||||
|
def test_unstructured_pdf_loader_default_mode() -> None:
|
||||||
"""Test unstructured loader."""
|
"""Test unstructured loader."""
|
||||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||||
loader = UnstructuredPDFLoader(str(file_path))
|
loader = UnstructuredPDFLoader(str(file_path))
|
||||||
|
Loading…
Reference in New Issue
Block a user