From 27c373f8c32d81cbd27dd6e008a7de807d33410e Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Thu, 9 Feb 2023 23:47:33 -0800
Subject: [PATCH] cr

---
 langchain/document_loaders/paged_pdf.py          |  4 ++--
 tests/integration_tests/test_pdf_pagesplitter.py | 15 ++++++---------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/langchain/document_loaders/paged_pdf.py b/langchain/document_loaders/paged_pdf.py
index 940f2965f78..8eec10f70b8 100644
--- a/langchain/document_loaders/paged_pdf.py
+++ b/langchain/document_loaders/paged_pdf.py
@@ -1,7 +1,8 @@
 """Loads a PDF with pypdf and chunks at character level."""
 from typing import Dict, List, Optional, Tuple
-from langchain.document_loaders.base import BaseLoader
+
 from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
 
 
 class PagedPDFSplitter(BaseLoader):
@@ -10,7 +11,6 @@ class PagedPDFSplitter(BaseLoader):
     Loader also stores page numbers in metadatas.
     """
 
-
     def __init__(self, file_path: str):
         """Initialize with file path."""
         try:
diff --git a/tests/integration_tests/test_pdf_pagesplitter.py b/tests/integration_tests/test_pdf_pagesplitter.py
index ba46385b531..f022597754a 100644
--- a/tests/integration_tests/test_pdf_pagesplitter.py
+++ b/tests/integration_tests/test_pdf_pagesplitter.py
@@ -8,15 +8,12 @@ from langchain.vectorstores import FAISS
 
 def test_pdf_pagesplitter() -> None:
     """Test splitting with page numbers included."""
-    loader = PagedPDFSplitter(chunk_size=250)
     script_dir = os.path.dirname(__file__)
-    splits, metadatas = loader.load_and_split(
-        os.path.join(script_dir, "examples/hello.pdf")
-    )
-    assert "pages" in metadatas[0]
-    assert "key" in metadatas[0]
-    assert len(splits) == len(metadatas)
+    loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
+    docs = loader.load()
+    assert "page" in docs[0].metadata
+    assert "source" in docs[0].metadata
 
-    faiss_index = FAISS.from_texts(splits, OpenAIEmbeddings(), metadatas=metadatas)
+    faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())
     docs = faiss_index.similarity_search("Complete this sentence: Hello", k=1)
-    assert "Hello World" in docs[0].page_content
+    assert "Hello world" in docs[0].page_content