mirror of https://github.com/hwchase17/langchain.git (synced 2025-08-15 07:36:08 +00:00)

Commit 27c373f8c3 ("cr"), parent 05d125ac23.
Changes to the PagedPDFSplitter loader module: the BaseLoader import is moved after the docstore import, and blank-line spacing around the class is normalized.

@@ -1,7 +1,8 @@
 """Loads a PDF with pypdf and chunks at character level."""
 from typing import Dict, List, Optional, Tuple
 
-from langchain.document_loaders.base import BaseLoader
 from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
 
+
 class PagedPDFSplitter(BaseLoader):

@@ -10,7 +11,6 @@ class PagedPDFSplitter(BaseLoader):
     Loader also stores page numbers in metadatas.
     """
 
-
     def __init__(self, file_path: str):
         """Initialize with file path."""
         try:
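For orientation, a minimal sketch of what the loader module plausibly looks like after this commit. The diff above only shows the import reorder and surrounding context, so the load() body below is an assumption, reconstructed from the module docstring ("Loads a PDF with pypdf") and the metadata keys the updated test asserts on ("page", "source"):

from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class PagedPDFSplitter(BaseLoader):
    """Loads a PDF with pypdf and chunks at character level.

    Loader also stores page numbers in metadatas.
    """

    def __init__(self, file_path: str):
        """Initialize with file path."""
        try:
            import pypdf  # noqa: F401
        except ImportError:
            raise ValueError(
                "pypdf package not found, please install it with `pip install pypdf`"
            )
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Assumed body: return one Document per PDF page, carrying the
        page number and source path in metadata, as the updated test expects."""
        import pypdf

        with open(self._file_path, "rb") as pdf_file:
            reader = pypdf.PdfReader(pdf_file)
            return [
                Document(
                    page_content=page.extract_text(),
                    metadata={"source": self._file_path, "page": i},
                )
                for i, page in enumerate(reader.pages)
            ]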
Changes to the loader's integration test: the test now constructs the loader with a file path, calls load(), and checks the page/source metadata of the returned Documents; the FAISS index is built with from_documents, and the expected string is corrected to "Hello world".

@@ -8,15 +8,12 @@ from langchain.vectorstores import FAISS
 
 def test_pdf_pagesplitter() -> None:
     """Test splitting with page numbers included."""
-    loader = PagedPDFSplitter(chunk_size=250)
     script_dir = os.path.dirname(__file__)
-    splits, metadatas = loader.load_and_split(
-        os.path.join(script_dir, "examples/hello.pdf")
-    )
-    assert "pages" in metadatas[0]
-    assert "key" in metadatas[0]
-    assert len(splits) == len(metadatas)
+    loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
+    docs = loader.load()
+    assert "page" in docs[0].metadata
+    assert "source" in docs[0].metadata
 
-    faiss_index = FAISS.from_texts(splits, OpenAIEmbeddings(), metadatas=metadatas)
+    faiss_index = FAISS.from_documents(docs, OpenAIEmbeddings())
     docs = faiss_index.similarity_search("Complete this sentence: Hello", k=1)
-    assert "Hello World" in docs[0].page_content
+    assert "Hello world" in docs[0].page_content
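The switch from FAISS.from_texts to FAISS.from_documents follows from the new Document-based API: from_documents unpacks each Document into its text and metadata before delegating to from_texts, so the page and source metadata travel into the index without a separate metadatas argument. Roughly, reusing the docs variable from the test above (a sketch of the relationship, not the library's literal source):

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# In effect, what from_documents does internally:
texts = [doc.page_content for doc in docs]
metadatas = [doc.metadata for doc in docs]
faiss_index = FAISS.from_texts(texts, OpenAIEmbeddings(), metadatas=metadatas)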