mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-15 23:57:21 +00:00
cr
This commit is contained in:
parent
9011f690c6
commit
05d125ac23
@ -1,7 +1,6 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"attachments": {},
|
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -19,20 +18,17 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain.document_loaders import PagedPDFSplitter\n",
|
"from langchain.document_loaders import PagedPDFSplitter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"loader = PagedPDFSplitter(chunk_size=250)\n",
|
"loader = PagedPDFSplitter(\"example_data/layout-parser-paper.pdf\")\n",
|
||||||
"splits, metadatas = loader.load_and_split(\n",
|
"pages = loader.load_and_split()"
|
||||||
" \"examples/example_data/layout-parser-paper.pdf\"\n",
|
|
||||||
")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"attachments": {},
|
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
@ -43,18 +39,97 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"9: 10 Z. Shen et al.\n",
|
||||||
|
"Fig. 4: Illustration of (a) the original historical Japanese document with layout\n",
|
||||||
|
"detection results and (b) a recreated version of the document image that achieves\n",
|
||||||
|
"much better character recognition recall. The reorganization algorithm rearranges\n",
|
||||||
|
"the tokens based on the their detected bounding boxes given a maximum allowed\n",
|
||||||
|
"height.\n",
|
||||||
|
"4LayoutParser Community Platform\n",
|
||||||
|
"Another focus of LayoutParser is promoting the reusability of layout detection\n",
|
||||||
|
"models and full digitization pipelines. Similar to many existing deep learning\n",
|
||||||
|
"libraries, LayoutParser comes with a community model hub for distributing\n",
|
||||||
|
"layout models. End-users can upload their self-trained models to the model hub,\n",
|
||||||
|
"and these models can be loaded into a similar interface as the currently available\n",
|
||||||
|
"LayoutParser pre-trained models. For example, the model trained on the News\n",
|
||||||
|
"Navigator dataset [17] has been incorporated in the model hub.\n",
|
||||||
|
"Beyond DL models, LayoutParser also promotes the sharing of entire doc-\n",
|
||||||
|
"ument digitization pipelines. For example, sometimes the pipeline requires the\n",
|
||||||
|
"combination of multiple DL models to achieve better accuracy. Currently, pipelines\n",
|
||||||
|
"are mainly described in academic papers and implementations are often not pub-\n",
|
||||||
|
"licly available. To this end, the LayoutParser community platform also enables\n",
|
||||||
|
"the sharing of layout pipelines to promote the discussion and reuse of techniques.\n",
|
||||||
|
"For each shared pipeline, it has a dedicated project page, with links to the source\n",
|
||||||
|
"code, documentation, and an outline of the approaches. A discussion panel is\n",
|
||||||
|
"provided for exchanging ideas. Combined with the core LayoutParser library,\n",
|
||||||
|
"users can easily build reusable components based on the shared pipelines and\n",
|
||||||
|
"apply them to solve their unique problems.\n",
|
||||||
|
"5 Use Cases\n",
|
||||||
|
"The core objective of LayoutParser is to make it easier to create both large-scale\n",
|
||||||
|
"and light-weight document digitization pipelines. Large-scale document processing\n",
|
||||||
|
"3: 4 Z. Shen et al.\n",
|
||||||
|
"Efficient Data AnnotationC u s t o m i z e d M o d e l T r a i n i n gModel Cust omizationDI A Model HubDI A Pipeline SharingCommunity PlatformLa y out Detection ModelsDocument Images \n",
|
||||||
|
"T h e C o r e L a y o u t P a r s e r L i b r a r yOCR ModuleSt or age & VisualizationLa y out Data Structur e\n",
|
||||||
|
"Fig. 1: The overall architecture of LayoutParser . For an input document image,\n",
|
||||||
|
"the core LayoutParser library provides a set of o\u000b",
|
||||||
|
"-the-shelf tools for layout\n",
|
||||||
|
"detection, OCR, visualization, and storage, backed by a carefully designed layout\n",
|
||||||
|
"data structure. LayoutParser also supports high level customization via e\u000ecient\n",
|
||||||
|
"layout annotation and model training functions. These improve model accuracy\n",
|
||||||
|
"on the target samples. The community platform enables the easy sharing of DIA\n",
|
||||||
|
"models and whole digitization pipelines to promote reusability and reproducibility.\n",
|
||||||
|
"A collection of detailed documentation, tutorials and exemplar projects make\n",
|
||||||
|
"LayoutParser easy to learn and use.\n",
|
||||||
|
"AllenNLP [ 8] and transformers [ 34] have provided the community with complete\n",
|
||||||
|
"DL-based support for developing and deploying models for general computer\n",
|
||||||
|
"vision and natural language processing problems. LayoutParser , on the other\n",
|
||||||
|
"hand, specializes speci\f",
|
||||||
|
"cally in DIA tasks. LayoutParser is also equipped with a\n",
|
||||||
|
"community platform inspired by established model hubs such as Torch Hub [23]\n",
|
||||||
|
"andTensorFlow Hub [1]. It enables the sharing of pretrained models as well as\n",
|
||||||
|
"full document processing pipelines that are unique to DIA tasks.\n",
|
||||||
|
"There have been a variety of document data collections to facilitate the\n",
|
||||||
|
"development of DL models. Some examples include PRImA [ 3](magazine layouts),\n",
|
||||||
|
"PubLayNet [ 38](academic paper layouts), Table Bank [ 18](tables in academic\n",
|
||||||
|
"papers), Newspaper Navigator Dataset [ 16,17](newspaper \f",
|
||||||
|
"gure layouts) and\n",
|
||||||
|
"HJDataset [31](historical Japanese document layouts). A spectrum of models\n",
|
||||||
|
"trained on these datasets are currently available in the LayoutParser model zoo\n",
|
||||||
|
"to support di\u000b",
|
||||||
|
"erent use cases.\n",
|
||||||
|
"3 The Core LayoutParser Library\n",
|
||||||
|
"At the core of LayoutParser is an o\u000b",
|
||||||
|
"-the-shelf toolkit that streamlines DL-\n",
|
||||||
|
"based document image analysis. Five components support a simple interface\n",
|
||||||
|
"with comprehensive functionalities: 1) The layout detection models enable using\n",
|
||||||
|
"pre-trained or self-trained DL models for layout detection with just four lines\n",
|
||||||
|
"of code. 2) The detected layout information is stored in carefully engineered\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from langchain.vectorstores import FAISS\n",
|
"from langchain.vectorstores import FAISS\n",
|
||||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||||
"\n",
|
"\n",
|
||||||
"faiss_index = FAISS.from_texts(splits, OpenAIEmbeddings(), metadatas=metadatas)\n",
|
"faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())\n",
|
||||||
"docs = faiss_index.similarity_search(\"How will the community be engaged?\", k=2)\n",
|
"docs = faiss_index.similarity_search(\"How will the community be engaged?\", k=2)\n",
|
||||||
"for doc in docs:\n",
|
"for doc in docs:\n",
|
||||||
" print(doc.metadata[\"pages\"] + \":\", doc.page_content)"
|
" print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@ -73,9 +148,8 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.11.0"
|
"version": "3.9.1"
|
||||||
},
|
},
|
||||||
"orig_nbformat": 4,
|
|
||||||
"vscode": {
|
"vscode": {
|
||||||
"interpreter": {
|
"interpreter": {
|
||||||
"hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a"
|
"hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a"
|
||||||
|
@ -1,14 +1,17 @@
|
|||||||
"""Loads a PDF with pypdf and chunks at character level."""
|
"""Loads a PDF with pypdf and chunks at character level."""
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
|
||||||
|
|
||||||
class PagedPDFSplitter:
|
class PagedPDFSplitter(BaseLoader):
|
||||||
"""Loads a PDF with pypdf and chunks at character level.
|
"""Loads a PDF with pypdf and chunks at character level.
|
||||||
|
|
||||||
Loader also stores page numbers in metadatas.
|
Loader also stores page numbers in metadatas.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 200):
|
|
||||||
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
try:
|
try:
|
||||||
import pypdf # noqa:F401
|
import pypdf # noqa:F401
|
||||||
@ -16,41 +19,18 @@ class PagedPDFSplitter:
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"pypdf package not found, please install it with " "`pip install pypdf`"
|
"pypdf package not found, please install it with " "`pip install pypdf`"
|
||||||
)
|
)
|
||||||
self.chunk_size = chunk_size
|
self._file_path = file_path
|
||||||
self.chunk_overlap = chunk_overlap
|
|
||||||
|
|
||||||
def load_and_split(
|
def load(self) -> List[Document]:
|
||||||
self, file_path: str, metadata: Optional[Dict] = None
|
"""Load given path as pages."""
|
||||||
) -> Tuple[List[str], List[Dict]]:
|
|
||||||
"""Load given path and split into texts and metadatas.
|
|
||||||
|
|
||||||
If given, the metadata given will
|
|
||||||
be duplicated and attached to each split along with page number.
|
|
||||||
If key is present in metadata, it also has page number
|
|
||||||
included (e.g., Foo2012 Pages 3-4).
|
|
||||||
"""
|
|
||||||
import pypdf
|
import pypdf
|
||||||
|
|
||||||
pdfFileObj = open(file_path, "rb")
|
pdf_file_obj = open(self._file_path, "rb")
|
||||||
pdfReader = pypdf.PdfReader(pdfFileObj)
|
pdf_reader = pypdf.PdfReader(pdf_file_obj)
|
||||||
splits = []
|
docs = []
|
||||||
split = ""
|
for i, page in enumerate(pdf_reader.pages):
|
||||||
pages = []
|
text = page.extract_text()
|
||||||
metadatas = []
|
metadata = {"source": self._file_path, "page": i}
|
||||||
key = (
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
metadata["key"] if metadata is not None and "key" in metadata else file_path
|
pdf_file_obj.close()
|
||||||
)
|
return docs
|
||||||
for i, page in enumerate(pdfReader.pages):
|
|
||||||
split += page.extract_text()
|
|
||||||
pages.append(str(i + 1))
|
|
||||||
if len(split) > self.chunk_size or i == len(pdfReader.pages) - 1:
|
|
||||||
splits.append(split[: self.chunk_size])
|
|
||||||
# pretty formatting of pages (e.g. 1-3, 4, 5-7)
|
|
||||||
pg = "-".join([pages[0], pages[-1]])
|
|
||||||
metadatas.append(dict(key=f"{key} pages {pg}", pages=pg))
|
|
||||||
if metadata is not None:
|
|
||||||
metadatas[-1].update(metadata)
|
|
||||||
split = str(splits[self.chunk_size: self.chunk_overlap])
|
|
||||||
pages = [str(i + 1)]
|
|
||||||
pdfFileObj.close()
|
|
||||||
return splits, metadatas
|
|
||||||
|
2
poetry.lock
generated
2
poetry.lock
generated
@ -7008,4 +7008,4 @@ llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.8.1,<4.0"
|
python-versions = ">=3.8.1,<4.0"
|
||||||
content-hash = "b190d518d7a99484ccb0aaf0ed43dfaf9c8cc74481b00da9d4fadd9d02c0dda2"
|
content-hash = "55ff8e2f70840a299ca72a27468cf18ec732514bdc2aa2ed9e8faf9bc5caa71f"
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
"""Test splitting with page numbers included."""
|
"""Test splitting with page numbers included."""
|
||||||
|
import os
|
||||||
|
|
||||||
from langchain.document_loaders import PagedPDFSplitter
|
from langchain.document_loaders import PagedPDFSplitter
|
||||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||||
from langchain.vectorstores import FAISS
|
from langchain.vectorstores import FAISS
|
||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
def test_pdf_pagesplitter() -> None:
|
def test_pdf_pagesplitter() -> None:
|
||||||
@ -10,7 +11,8 @@ def test_pdf_pagesplitter() -> None:
|
|||||||
loader = PagedPDFSplitter(chunk_size=250)
|
loader = PagedPDFSplitter(chunk_size=250)
|
||||||
script_dir = os.path.dirname(__file__)
|
script_dir = os.path.dirname(__file__)
|
||||||
splits, metadatas = loader.load_and_split(
|
splits, metadatas = loader.load_and_split(
|
||||||
os.path.join(script_dir, "examples/hello.pdf"))
|
os.path.join(script_dir, "examples/hello.pdf")
|
||||||
|
)
|
||||||
assert "pages" in metadatas[0]
|
assert "pages" in metadatas[0]
|
||||||
assert "key" in metadatas[0]
|
assert "key" in metadatas[0]
|
||||||
assert len(splits) == len(metadatas)
|
assert len(splits) == len(metadatas)
|
||||||
|
Loading…
Reference in New Issue
Block a user