mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-16 08:06:14 +00:00
Harrison/unstructured page number (#6464)
Co-authored-by: Reza Sanaie <reza@sanaie.ca>
This commit is contained in:
parent
b82ddf9cfb
commit
9eec7c3206
@ -226,13 +226,17 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "8de9ef16",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## PDF Example\n",
|
||||
"\n",
|
||||
"Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of `elements`. "
|
||||
"Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of elements. Modes of operation are \n",
|
||||
"- `single` all the text from all elements are combined into one (default)\n",
|
||||
"- `elements` maintain individual elements\n",
|
||||
"- `paged` texts from each page are only combined"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1,7 +1,7 @@
|
||||
"""Loader that uses unstructured to load files."""
|
||||
import collections
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import IO, Any, List, Sequence, Union
|
||||
from typing import IO, Any, Dict, List, Sequence, Union
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
@ -45,7 +45,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
_valid_modes = {"single", "elements"}
|
||||
_valid_modes = {"single", "elements", "paged"}
|
||||
if mode not in _valid_modes:
|
||||
raise ValueError(
|
||||
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
|
||||
@ -80,6 +80,31 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
if hasattr(element, "category"):
|
||||
metadata["category"] = element.category
|
||||
docs.append(Document(page_content=str(element), metadata=metadata))
|
||||
elif self.mode == "paged":
|
||||
text_dict: Dict[int, str] = {}
|
||||
meta_dict: Dict[int, Dict] = {}
|
||||
|
||||
for idx, element in enumerate(elements):
|
||||
metadata = self._get_metadata()
|
||||
if hasattr(element, "metadata"):
|
||||
metadata.update(element.metadata.to_dict())
|
||||
page_number = metadata.get("page_number", 1)
|
||||
|
||||
# Check if this page_number already exists in text_dict
|
||||
if page_number not in text_dict:
|
||||
# If not, create new entry with initial text and metadata
|
||||
text_dict[page_number] = str(element) + "\n\n"
|
||||
meta_dict[page_number] = metadata
|
||||
else:
|
||||
# If exists, append to text and update the metadata
|
||||
text_dict[page_number] += str(element) + "\n\n"
|
||||
meta_dict[page_number].update(metadata)
|
||||
|
||||
# Convert the dict to a list of Document objects
|
||||
docs = [
|
||||
Document(page_content=text_dict[key], metadata=meta_dict[key])
|
||||
for key in text_dict.keys()
|
||||
]
|
||||
elif self.mode == "single":
|
||||
metadata = self._get_metadata()
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
|
@ -11,7 +11,25 @@ from langchain.document_loaders import (
|
||||
)
|
||||
|
||||
|
||||
def test_unstructured_pdf_loader() -> None:
|
||||
def test_unstructured_pdf_loader_elements_mode() -> None:
    """Load ``hello.pdf`` with ``mode="elements"`` and check the result size.

    The sample file is resolved relative to this test module, two
    directories up, under ``examples/``. The loader is expected to return
    exactly two documents for it (presumably one per extracted element).
    """
    tests_root = Path(__file__).parent.parent
    pdf_path = str(tests_root / "examples/hello.pdf")

    documents = UnstructuredPDFLoader(pdf_path, mode="elements").load()

    assert len(documents) == 2
|
||||
|
||||
|
||||
def test_unstructured_pdf_loader_paged_mode() -> None:
    """Load ``layout-parser-paper.pdf`` with ``mode="paged"`` and check the size.

    The sample file is resolved relative to this test module, two
    directories up, under ``examples/``. The loader is expected to return
    exactly 16 documents (presumably one per page of the sample PDF).
    """
    tests_root = Path(__file__).parent.parent
    pdf_path = str(tests_root / "examples/layout-parser-paper.pdf")

    documents = UnstructuredPDFLoader(pdf_path, mode="paged").load()

    assert len(documents) == 16
|
||||
|
||||
|
||||
def test_unstructured_pdf_loader_default_mode() -> None:
|
||||
"""Test unstructured loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path))
|
||||
|
Loading…
Reference in New Issue
Block a user