Mirror of https://github.com/hwchase17/langchain.git (synced 2025-08-13 06:40:04 +00:00)

cr

This commit is contained in:
parent 05d125ac23
commit 27c373f8c3
@@ -1,7 +1,8 @@
 """Loads a PDF with pypdf and chunks at character level."""
 from typing import Dict, List, Optional, Tuple
-from langchain.document_loaders.base import BaseLoader
 
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
 
 
 class PagedPDFSplitter(BaseLoader):
@@ -10,7 +11,6 @@ class PagedPDFSplitter(BaseLoader):
     Loader also stores page numbers in metadatas.
     """
 
-
     def __init__(self, file_path: str):
         """Initialize with file path."""
         try:
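For orientation, here is a minimal usage sketch of the loader interface after this change, using only calls visible in the diff (a constructor that takes a file path, and load() returning Document objects whose metadata carries "source" and "page"). The import path and the sample file name are assumptions, not part of this commit:

    # Load a PDF page by page and inspect the per-page metadata.
    # Import path assumed; "examples/hello.pdf" is a placeholder file name.
    from langchain.document_loaders.pdf import PagedPDFSplitter

    loader = PagedPDFSplitter("examples/hello.pdf")
    docs = loader.load()
    for doc in docs:
        # Each Document records its source file and page number.
        print(doc.metadata["source"], doc.metadata["page"], len(doc.page_content))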
@@ -8,15 +8,12 @@ from langchain.vectorstores import FAISS
 
 def test_pdf_pagesplitter() -> None:
     """Test splitting with page numbers included."""
-    loader = PagedPDFSplitter(chunk_size=250)
     script_dir = os.path.dirname(__file__)
-    splits, metadatas = loader.load_and_split(
-        os.path.join(script_dir, "examples/hello.pdf")
-    )
-    assert "pages" in metadatas[0]
-    assert "key" in metadatas[0]
-    assert len(splits) == len(metadatas)
+    loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
+    docs = loader.load()
+    assert "page" in docs[0].metadata
+    assert "source" in docs[0].metadata
 
-    faiss_index = FAISS.from_texts(splits, OpenAIEmbeddings(), metadatas=metadatas)
+    faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())
     docs = faiss_index.similarity_search("Complete this sentence: Hello", k=1)
-    assert "Hello World" in docs[0].page_content
+    assert "Hello world" in docs[0].page_content
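The test change shows the point of the refactor: instead of threading parallel splits and metadatas lists into FAISS.from_texts, the loader now returns Document objects that can be passed directly to FAISS.from_documents. A hedged end-to-end sketch of that flow, mirroring the updated test (import paths assumed; an OpenAI API key is needed for the embeddings):

    # Index the per-page Documents and run a similarity query.
    # Import paths are assumptions; the PDF path is a placeholder.
    from langchain.document_loaders.pdf import PagedPDFSplitter
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import FAISS

    docs = PagedPDFSplitter("examples/hello.pdf").load()
    faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())
    matches = faiss_index.similarity_search("Complete this sentence: Hello", k=1)
    print(matches[0].page_content, matches[0].metadata)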