Files
DB-GPT/pilot/embedding_engine/loader/docx_loader.py
aries_ckt d6a23ead3b fix:csv_loader bug
1.add new_csv_loader,override load()
2.add loader dir
Close #396
2023-08-03 13:02:26 +08:00

29 lines
833 B
Python

from typing import List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
import docx
class DocxLoader(BaseLoader):
    """Load a .docx file into a single ``Document``.

    The file is opened with ``docx.Document`` and the text of every
    paragraph is concatenated into one ``page_content`` string.
    """

    def __init__(self, file_path: str, encoding: Optional[str] = None):
        """Initialize with file path.

        Args:
            file_path: Path of the .docx file handed to ``docx.Document``.
            encoding: Stored on the instance; ``load`` does not read it.
        """
        self.file_path = file_path
        self.encoding = encoding

    def load(self) -> List[Document]:
        """Load from file path.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is every paragraph's text joined with no separator and whose
            metadata is ``{"source": self.file_path}``.

        Any error raised by ``docx.Document`` while opening the file
        propagates to the caller; nothing is caught here.
        """
        doc = docx.Document(self.file_path)
        # Read ``doc.paragraphs`` once and iterate it directly instead of
        # indexing back into the attribute for every paragraph.
        # NOTE(review): paragraphs are joined with "" so text from adjacent
        # paragraphs runs together; kept as-is because downstream consumers
        # may rely on the exact content -- confirm whether "\n" is wanted.
        page_content = "".join(para.text for para in doc.paragraphs)
        return [Document(page_content=page_content, metadata={"source": self.file_path})]