community: Fix CSVLoader columns is None (#20701)

- **Bug code**: In
langchain_community/document_loaders/csv_loader.py:100

- **Description**: Currently, when 'CSVLoader' reads a column as None
in the 'csv' file, it raises an error, because 'CSVLoader' does not
verify that the column is of type str and does not consider how to
handle the corresponding 'row_data' when the column is 'None' in the
csv. This PR provides a solution.

- **Issue:**  Fix #20699 

- **thinking:**

1. Refer to the handling at
'langchain_community/document_loaders/csv_loader.py:100' for the case
where **'v'** equals 'None', and apply the same handling to **'k'**.
(See `csv.DictReader`: **'k'** is None only when
`len(columns) < len(number_row_data)` holds, i.e. a row has more fields
than the header.)
2. **'k'** can be None only for the last column, and its
corresponding **'v'** is then a list. Therefore, I followed the data
format used in 'Document' and joined the elements of the list with ','.
(I'm not sure whether this form is acceptable; if you have any other
ideas, let's discuss.)

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
maang-h 2024-05-23 03:57:46 +08:00 committed by GitHub
parent 403142eaba
commit fc93bed8c4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 59 additions and 1 deletions

View File

@ -97,7 +97,9 @@ class CSVLoader(BaseLoader):
f"Source column '{self.source_column}' not found in CSV file."
)
content = "\n".join(
f"{k.strip()}: {v.strip() if v is not None else v}"
f"""{k.strip() if k is not None else k}: {v.strip()
if isinstance(v, str) else ','.join(map(str.strip, v))
if isinstance(v, list) else v}"""
for k, v in row.items()
if k not in self.metadata_columns
)

View File

@ -85,6 +85,29 @@ class TestCSVLoader:
# Assert
assert result == expected_docs
def test_csv_loader_load_none_column_file(self) -> None:
    """Check loading of a CSV whose data rows outnumber the header columns.

    The expected page content labels the surplus values with the key
    ``None`` and joins them with a comma.
    """
    # Setup: one expected page-content string per CSV data row, in order.
    csv_path = self._get_csv_file_path("test_none_col.csv")
    contents_by_row = (
        "column1: value1\ncolumn2: value2\n"
        "column3: value3\nNone: value4,value5",
        "column1: value6\ncolumn2: value7\n"
        "column3: value8\nNone: value9",
    )
    expected_docs = [
        Document(
            page_content=text,
            metadata={"source": csv_path, "row": row_index},
        )
        for row_index, text in enumerate(contents_by_row)
    ]

    # Exercise
    result = CSVLoader(file_path=csv_path).load()

    # Assert
    assert result == expected_docs
# utility functions
def _get_csv_file_path(self, file_name: str) -> str:
    """Return the path of ``file_name`` inside the ``test_docs/csv`` folder.

    The folder is located relative to the resolved directory of this
    module, and the result is returned as a plain string.
    """
    csv_dir = Path(__file__).resolve().parent / "test_docs" / "csv"
    return str(csv_dir / file_name)

View File

@ -58,6 +58,22 @@ class TestDirectoryLoader:
"row": 1,
},
),
Document(
page_content="column1: value1\ncolumn2: value2\n"
"column3: value3\nNone: value4,value5",
metadata={
"source": self._get_csv_file_path("test_none_col.csv"),
"row": 0,
},
),
Document(
page_content="column1: value6\ncolumn2: value7\n"
"column3: value8\nNone: value9",
metadata={
"source": self._get_csv_file_path("test_none_col.csv"),
"row": 1,
},
),
]
loaded_docs = sorted(loader.load(), key=lambda doc: doc.metadata["source"])
@ -141,6 +157,20 @@ class TestDirectoryLoader:
metadata={"source": file_path, "row": 0},
)
]
file_name = "test_none_col.csv"
file_path = self._get_csv_file_path(file_name)
expected_docs += [
Document(
page_content="column1: value1\ncolumn2: value2\n"
"column3: value3\nNone: value4,value5",
metadata={"source": file_path, "row": 0},
),
Document(
page_content="column1: value6\ncolumn2: value7\n"
"column3: value8\nNone: value9",
metadata={"source": file_path, "row": 1},
),
]
# Assert
loader = DirectoryLoader(dir_path, loader_cls=CSVLoader)

View File

@ -0,0 +1,3 @@
column1,column2,column3
value1,value2,value3,value4,value5
value6,value7,value8,value9
1 column1,column2,column3
2 value1,value2,value3,value4,value5
3 value6,value7,value8,value9