Files
DB-GPT/pilot/embedding_engine/ppt_loader.py
aries_ckt 795be61d4a style:fmt
2023-07-25 17:22:11 +08:00

29 lines
905 B
Python

from typing import List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from pptx import Presentation
class PPTLoader(BaseLoader):
    """Load the text of a PowerPoint file as a list of documents.

    Every shape that exposes non-empty text becomes one ``Document``; the
    document's ``source`` metadata is the ``slide_id`` of the slide that
    contains the shape.
    """

    def __init__(self, file_path: str, encoding: Optional[str] = None):
        """Initialize with file path.

        Args:
            file_path: Path of the presentation, passed to ``Presentation``
                when ``load`` is called.
            encoding: Stored on the instance; ``load`` does not read it.
        """
        self.file_path = file_path
        self.encoding = encoding

    def load(self) -> List[Document]:
        """Load from file path.

        Returns:
            One ``Document`` per shape with non-empty text, in slide order
            and then shape order. Shapes that have no ``text`` attribute,
            or whose text is empty, are skipped.
        """
        presentation = Presentation(self.file_path)
        docs = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Not every shape exposes ``text``; fall back to an empty
                # string so such shapes are skipped by the check below.
                text = getattr(shape, "text", "")
                # Compare by value. The previous ``is not ""`` identity
                # test depended on string interning and raises a
                # SyntaxWarning on Python 3.8+.
                if not text:
                    continue
                docs.append(
                    Document(
                        page_content=text,
                        metadata={"source": slide.slide_id},
                    )
                )
        return docs