This commit is contained in:
Harrison Chase 2023-02-09 23:47:33 -08:00
parent 05d125ac23
commit 27c373f8c3
2 changed files with 8 additions and 11 deletions

View File

@@ -1,7 +1,8 @@
"""Loads a PDF with pypdf and chunks at character level."""
from typing import Dict, List, Optional, Tuple
from langchain.document_loaders.base import BaseLoader
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class PagedPDFSplitter(BaseLoader):
@@ -10,7 +11,6 @@ class PagedPDFSplitter(BaseLoader):
Loader also stores page numbers in metadatas.
"""
def __init__(self, file_path: str):
"""Initialize with file path."""
try:

View File

@@ -8,15 +8,12 @@ from langchain.vectorstores import FAISS
def test_pdf_pagesplitter() -> None:
"""Test splitting with page numbers included."""
loader = PagedPDFSplitter(chunk_size=250)
script_dir = os.path.dirname(__file__)
splits, metadatas = loader.load_and_split(
os.path.join(script_dir, "examples/hello.pdf")
)
assert "pages" in metadatas[0]
assert "key" in metadatas[0]
assert len(splits) == len(metadatas)
loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
docs = loader.load()
assert "page" in docs[0].metadata
assert "source" in docs[0].metadata
faiss_index = FAISS.from_texts(splits, OpenAIEmbeddings(), metadatas=metadatas)
faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())
docs = faiss_index.similarity_search("Complete this sentence: Hello", k=1)
assert "Hello World" in docs[0].page_content
assert "Hello world" in docs[0].page_content