Merge pull request #292 from jiangzhuo/feature/multiprocessing-for-document-loading

Optimize load_documents function with multiprocessing
This commit is contained in:
Iván Martínez
2023-05-20 10:57:42 +02:00
committed by GitHub
2 changed files with 13 additions and 3 deletions

View File

@@ -3,6 +3,8 @@ import os
import glob
from typing import List
from dotenv import load_dotenv
from multiprocessing import Pool
from tqdm import tqdm
from langchain.document_loaders import (
CSVLoader,
@@ -79,7 +81,6 @@ def load_single_document(file_path: str) -> Document:
raise ValueError(f"Unsupported file extension '{ext}'")
def load_documents(source_dir: str) -> List[Document]:
# Loads all documents from source documents directory
all_files = []
@@ -87,7 +88,15 @@ def load_documents(source_dir: str) -> List[Document]:
all_files.extend(
glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
)
return [load_single_document(file_path) for file_path in all_files]
with Pool(processes=os.cpu_count()) as pool:
results = []
with tqdm(total=len(all_files), desc='Loading documents', ncols=80) as pbar:
for i, doc in enumerate(pool.imap_unordered(load_single_document, all_files)):
results.append(doc)
pbar.update()
return results
def main():