community: Fix CSVLoader columns is None (#20701)

- **Bug code**: `langchain_community/document_loaders/csv_loader.py:100` - **Description**: Currently, when `CSVLoader` reads a row from the CSV file whose column key is `None`, it raises an error, because `CSVLoader` does not verify that the column key is a `str` and does not handle the corresponding row data when the column key is `None`. This PR provides a fix. - **Issue:** Fix #20699 - **Thinking:** 1. Follow the existing handling at `langchain_community/document_loaders/csv_loader.py:100` for the case where **`v`** is `None`, and apply the same handling to **`k`**. (Per `csv.DictReader`, **`k`** is `None` only when a row has more fields than there are columns, i.e. when `len(columns) < len(row_data)`.) 2. **`k`** can be `None` only for the extra fields that follow the last column, and its corresponding **`v`** is then a list. Therefore, following the data format used in `Document`, I joined the elements of the list with `,`. (I'm not sure whether this format is acceptable; if you have other ideas, please let me know.) --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
2025-07-17 02:03:44 +00:00 · 2024-05-23 03:57:46 +08:00 · 2024-05-23 03:57:46 +08:00 · fc93bed8c4
commit fc93bed8c4
parent 403142eaba
4 changed files with 59 additions and 1 deletions
--- a/libs/community/langchain_community/document_loaders/csv_loader.py
+++ b/libs/community/langchain_community/document_loaders/csv_loader.py
@ -97,7 +97,9 @@ class CSVLoader(BaseLoader):
                    f"Source column '{self.source_column}' not found in CSV file."
                )
            content = "\n".join(
-                f"{k.strip()}: {v.strip() if v is not None else v}"
+                f"""{k.strip() if k is not None else k}: {v.strip()
+                if isinstance(v, str) else ','.join(map(str.strip, v))
+                if isinstance(v, list) else v}"""
                for k, v in row.items()
                if k not in self.metadata_columns
            )
--- a/libs/community/tests/unit_tests/document_loaders/test_csv_loader.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_csv_loader.py
@ -85,6 +85,29 @@ class TestCSVLoader:
        # Assert
        assert result == expected_docs

+    def test_csv_loader_load_none_column_file(self) -> None:
+        # Setup
+        file_path = self._get_csv_file_path("test_none_col.csv")
+        expected_docs = [
+            Document(
+                page_content="column1: value1\ncolumn2: value2\n"
+                "column3: value3\nNone: value4,value5",
+                metadata={"source": file_path, "row": 0},
+            ),
+            Document(
+                page_content="column1: value6\ncolumn2: value7\n"
+                "column3: value8\nNone: value9",
+                metadata={"source": file_path, "row": 1},
+            ),
+        ]
+
+        # Exercise
+        loader = CSVLoader(file_path=file_path)
+        result = loader.load()
+
+        # Assert
+        assert result == expected_docs
+
    # utility functions
    def _get_csv_file_path(self, file_name: str) -> str:
        return str(Path(__file__).resolve().parent / "test_docs" / "csv" / file_name)
--- a/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_directory_loader.py
@ -58,6 +58,22 @@ class TestDirectoryLoader:
                    "row": 1,
                },
            ),
+            Document(
+                page_content="column1: value1\ncolumn2: value2\n"
+                "column3: value3\nNone: value4,value5",
+                metadata={
+                    "source": self._get_csv_file_path("test_none_col.csv"),
+                    "row": 0,
+                },
+            ),
+            Document(
+                page_content="column1: value6\ncolumn2: value7\n"
+                "column3: value8\nNone: value9",
+                metadata={
+                    "source": self._get_csv_file_path("test_none_col.csv"),
+                    "row": 1,
+                },
+            ),
        ]

        loaded_docs = sorted(loader.load(), key=lambda doc: doc.metadata["source"])
@ -141,6 +157,20 @@ class TestDirectoryLoader:
                metadata={"source": file_path, "row": 0},
            )
        ]
+        file_name = "test_none_col.csv"
+        file_path = self._get_csv_file_path(file_name)
+        expected_docs += [
+            Document(
+                page_content="column1: value1\ncolumn2: value2\n"
+                "column3: value3\nNone: value4,value5",
+                metadata={"source": file_path, "row": 0},
+            ),
+            Document(
+                page_content="column1: value6\ncolumn2: value7\n"
+                "column3: value8\nNone: value9",
+                metadata={"source": file_path, "row": 1},
+            ),
+        ]

        # Assert
        loader = DirectoryLoader(dir_path, loader_cls=CSVLoader)
--- a/libs/community/tests/unit_tests/document_loaders/test_docs/csv/test_none_col.csv
+++ b/libs/community/tests/unit_tests/document_loaders/test_docs/csv/test_none_col.csv
@ -0,0 +1,3 @@
+column1,column2,column3
+value1,value2,value3,value4,value5
+value6,value7,value8,value9