From fc93bed8c491da3b7d0dec3b1e2db81e90eb2ed3 Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Thu, 23 May 2024 03:57:46 +0800 Subject: [PATCH] community: Fix CSVLoader columns is None (#20701) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - **Bug code**: In langchain_community/document_loaders/csv_loader.py:100 - **Description**: Currently, when 'CSVLoader' reads a column name as None from the 'csv' file, it raises an error because 'CSVLoader' does not verify that the column name is of str type and does not consider how to handle the corresponding 'row_data' when the column name is None in the csv. This PR provides a solution. - **Issue:** Fix #20699 - **thinking:** 1. Refer to the handling at 'langchain_community/document_loaders/csv_loader.py:100' for the case where **'v'** is None, and apply the same handling to **'k'**. (See `csv.DictReader`: **'k'** will only be None when `len(columns) < len(number_row_data)` holds.) 2. **'k'** is None only for the last column, and its corresponding **'v'** is a list.
Therefore, I referred to the data format in 'Document' and used ',' to concatenate the elements in the list. (I'm not sure whether you will accept this form; if you have any other ideas, please let me know.) --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> --- .../document_loaders/csv_loader.py | 4 ++- .../document_loaders/test_csv_loader.py | 23 ++++++++++++++ .../document_loaders/test_directory_loader.py | 30 +++++++++++++++++++ .../test_docs/csv/test_none_col.csv | 3 ++ 4 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 libs/community/tests/unit_tests/document_loaders/test_docs/csv/test_none_col.csv diff --git a/libs/community/langchain_community/document_loaders/csv_loader.py b/libs/community/langchain_community/document_loaders/csv_loader.py index fca2f1f0f9b..37e6f565531 100644 --- a/libs/community/langchain_community/document_loaders/csv_loader.py +++ b/libs/community/langchain_community/document_loaders/csv_loader.py @@ -97,7 +97,9 @@ class CSVLoader(BaseLoader): f"Source column '{self.source_column}' not found in CSV file." 
) content = "\n".join( - f"{k.strip()}: {v.strip() if v is not None else v}" + f"""{k.strip() if k is not None else k}: {v.strip() + if isinstance(v, str) else ','.join(map(str.strip, v)) + if isinstance(v, list) else v}""" for k, v in row.items() if k not in self.metadata_columns ) diff --git a/libs/community/tests/unit_tests/document_loaders/test_csv_loader.py b/libs/community/tests/unit_tests/document_loaders/test_csv_loader.py index a9d4212c15e..a7ab65e35a4 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_csv_loader.py +++ b/libs/community/tests/unit_tests/document_loaders/test_csv_loader.py @@ -85,6 +85,29 @@ class TestCSVLoader: # Assert assert result == expected_docs + def test_csv_loader_load_none_column_file(self) -> None: + # Setup + file_path = self._get_csv_file_path("test_none_col.csv") + expected_docs = [ + Document( + page_content="column1: value1\ncolumn2: value2\n" + "column3: value3\nNone: value4,value5", + metadata={"source": file_path, "row": 0}, + ), + Document( + page_content="column1: value6\ncolumn2: value7\n" + "column3: value8\nNone: value9", + metadata={"source": file_path, "row": 1}, + ), + ] + + # Exercise + loader = CSVLoader(file_path=file_path) + result = loader.load() + + # Assert + assert result == expected_docs + # utility functions def _get_csv_file_path(self, file_name: str) -> str: return str(Path(__file__).resolve().parent / "test_docs" / "csv" / file_name) diff --git a/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py b/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py index d8795a6ce3b..e385e77b33e 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py +++ b/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py @@ -58,6 +58,22 @@ class TestDirectoryLoader: "row": 1, }, ), + Document( + page_content="column1: value1\ncolumn2: value2\n" + "column3: value3\nNone: value4,value5", + metadata={ + "source": 
self._get_csv_file_path("test_none_col.csv"), + "row": 0, + }, + ), + Document( + page_content="column1: value6\ncolumn2: value7\n" + "column3: value8\nNone: value9", + metadata={ + "source": self._get_csv_file_path("test_none_col.csv"), + "row": 1, + }, + ), ] loaded_docs = sorted(loader.load(), key=lambda doc: doc.metadata["source"]) @@ -141,6 +157,20 @@ class TestDirectoryLoader: metadata={"source": file_path, "row": 0}, ) ] + file_name = "test_none_col.csv" + file_path = self._get_csv_file_path(file_name) + expected_docs += [ + Document( + page_content="column1: value1\ncolumn2: value2\n" + "column3: value3\nNone: value4,value5", + metadata={"source": file_path, "row": 0}, + ), + Document( + page_content="column1: value6\ncolumn2: value7\n" + "column3: value8\nNone: value9", + metadata={"source": file_path, "row": 1}, + ), + ] # Assert loader = DirectoryLoader(dir_path, loader_cls=CSVLoader) diff --git a/libs/community/tests/unit_tests/document_loaders/test_docs/csv/test_none_col.csv b/libs/community/tests/unit_tests/document_loaders/test_docs/csv/test_none_col.csv new file mode 100644 index 00000000000..a6a3d77e050 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/test_docs/csv/test_none_col.csv @@ -0,0 +1,3 @@ +column1,column2,column3 +value1,value2,value3,value4,value5 +value6,value7,value8,value9