Optimize load_documents function with multiprocessing

jiangzhuo 2023-05-19 02:35:20 +09:00
parent ad64589c8f
commit 81b221bccb


@@ -2,6 +2,7 @@ import os
 import glob
 from typing import List
 from dotenv import load_dotenv
+from multiprocessing import Pool
 from langchain.document_loaders import (
     CSVLoader,
@@ -64,7 +65,9 @@ def load_documents(source_dir: str) -> List[Document]:
         all_files.extend(
             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
         )
-    return [load_single_document(file_path) for file_path in all_files]
+    with Pool(processes=os.cpu_count()) as pool:
+        documents = pool.map(load_single_document, all_files)
+    return documents


 def main():
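
For context, here is a minimal self-contained sketch of the parallelized loader after this change. The load_single_document stub below is hypothetical, standing in for the project's real loader, which dispatches to a langchain loader by file extension and returns Document objects; the "*.txt"-only glob and the source_documents path are likewise simplifying assumptions for the sketch.

    import os
    import glob
    from multiprocessing import Pool
    from typing import List

    def load_single_document(file_path: str) -> str:
        # Hypothetical stand-in: the project's real function picks a
        # langchain loader by extension and returns a Document.
        with open(file_path, errors="ignore") as f:
            return f.read()

    def load_documents(source_dir: str) -> List[str]:
        # Gather every matching file, then fan the per-file loads out
        # across one worker process per CPU core, as in the diff above.
        all_files = glob.glob(
            os.path.join(source_dir, "**/*.txt"), recursive=True
        )
        with Pool(processes=os.cpu_count()) as pool:
            documents = pool.map(load_single_document, all_files)
        return documents

    if __name__ == "__main__":
        # The __main__ guard matters here: multiprocessing re-imports
        # this module in each worker on platforms that spawn processes.
        print(f"Loaded {len(load_documents('source_documents'))} documents")

pool.map blocks until every worker finishes and preserves the order of all_files, so the result is equivalent to the replaced list comprehension, just computed in parallel. The trade-off is process-startup and pickling overhead, so the speedup is largest when there are many files or per-file parsing is expensive.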