community[patch]: added content_columns option to CSVLoader (#23809)

**Description:** Adds a new option to the CSVLoader that allows us to explicitly specify the columns that are used for generating the Document content. Currently these are implicitly set as "all fields not part of the metadata_columns". In some cases, however, it is useful to have a field both as metadata and as part of the document content.
2025-09-06 21:43:44 +00:00 · 2024-09-02 22:25:53 +02:00
parent ab527027ac
commit 6a8f8a56ac
2 changed files with 31 additions and 1 deletions
--- a/libs/community/tests/unit_tests/document_loaders/test_csv_loader.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_csv_loader.py
@@ -108,6 +108,27 @@ class TestCSVLoader:
        # Assert
        assert result == expected_docs

+    def test_csv_loader_content_columns(self) -> None:
+        # Setup
+        file_path = self._get_csv_file_path("test_none_col.csv")
+        expected_docs = [
+            Document(
+                page_content="column1: value1\n" "column3: value3",
+                metadata={"source": file_path, "row": 0},
+            ),
+            Document(
+                page_content="column1: value6\n" "column3: value8",
+                metadata={"source": file_path, "row": 1},
+            ),
+        ]
+
+        # Exercise
+        loader = CSVLoader(file_path=file_path, content_columns=("column1", "column3"))
+        result = loader.load()
+
+        # Assert
+        assert result == expected_docs
+
    # utility functions
    def _get_csv_file_path(self, file_name: str) -> str:
        return str(Path(__file__).resolve().parent / "test_docs" / "csv" / file_name)