From e818c75f8aeaeed1fdb1f945543982b4816e43fb Mon Sep 17 00:00:00 2001 From: Chip Davis <62909360+chip-davis@users.noreply.github.com> Date: Fri, 26 Apr 2024 21:16:47 -0500 Subject: [PATCH] infra: test directory loader multithreaded (#20281) This is a unit test for #20230 which was a fix for using multithreaded mode with directory loader @eyurtsev --- .../document_loaders/test_directory_loader.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py b/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py index 3793878297a..d8795a6ce3b 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py +++ b/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py @@ -8,6 +8,64 @@ from langchain_community.document_loaders.directory import DirectoryLoader class TestDirectoryLoader: + # Tests that multiple documents are read successfully with multithreading enabled.
+    def test_directory_loader_with_multithreading_enabled(self) -> None:
+        dir_path = self._get_csv_dir_path()
+        loader = DirectoryLoader(
+            dir_path, glob="**/*.csv", loader_cls=CSVLoader, use_multithreading=True
+        )
+
+        expected_docs = [
+            Document(
+                page_content="column1: value1",
+                metadata={
+                    "source": self._get_csv_file_path("test_one_col.csv"),
+                    "row": 0,
+                },
+            ),
+            Document(
+                page_content="column1: value2",
+                metadata={
+                    "source": self._get_csv_file_path("test_one_col.csv"),
+                    "row": 1,
+                },
+            ),
+            Document(
+                page_content="column1: value3",
+                metadata={
+                    "source": self._get_csv_file_path("test_one_col.csv"),
+                    "row": 2,
+                },
+            ),
+            Document(
+                page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
+                metadata={
+                    "source": self._get_csv_file_path("test_one_row.csv"),
+                    "row": 0,
+                },
+            ),
+            Document(
+                page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
+                metadata={
+                    "source": self._get_csv_file_path("test_nominal.csv"),
+                    "row": 0,
+                },
+            ),
+            Document(
+                page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
+                metadata={
+                    "source": self._get_csv_file_path("test_nominal.csv"),
+                    "row": 1,
+                },
+            ),
+        ]
+
+        # Threads finish in any order, so check membership rather than position.
+        loaded_docs = loader.load()
+        assert len(loaded_docs) == len(expected_docs)  # no missing/extra docs
+        for doc in expected_docs:
+            assert doc in loaded_docs
+
     # Tests that lazy loading a CSV file with multiple documents is successful.
     def test_directory_loader_lazy_load_single_file_multiple_docs(self) -> None:
         # Setup