mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-01 20:49:17 +00:00
community: Fix CSVLoader columns is None (#20701)
- **Bug code**: In langchain_community/document_loaders/csv_loader.py:100 - **Description**: Currently, when `CSVLoader` reads a row of a CSV file in which a column name is `None`, it raises an error, because `CSVLoader` does not verify that the column name is of type `str` and does not handle the corresponding row data when the column name is `None`. This PR provides a solution. - **Issue:** Fix #20699 - **Thinking:** 1. Refer to how 'langchain_community/document_loaders/csv_loader.py:100' handles the case where **'v'** is `None`, and apply the same approach to **'k'**. (See `csv.DictReader`: **'k'** is `None` only when a row has more fields than there are column names, i.e. when `len(columns) < len(number_row_data)` holds.) 2. **'k'** is `None` only for the fields past the last named column, and its corresponding **'v'** is a list. Therefore, following the data format used in `Document`, I joined the elements of the list with ','. (I'm not sure whether this format is acceptable; if you have other ideas, please let me know.) --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent
403142eaba
commit
fc93bed8c4
@ -97,7 +97,9 @@ class CSVLoader(BaseLoader):
|
||||
f"Source column '{self.source_column}' not found in CSV file."
|
||||
)
|
||||
content = "\n".join(
|
||||
f"{k.strip()}: {v.strip() if v is not None else v}"
|
||||
f"""{k.strip() if k is not None else k}: {v.strip()
|
||||
if isinstance(v, str) else ','.join(map(str.strip, v))
|
||||
if isinstance(v, list) else v}"""
|
||||
for k, v in row.items()
|
||||
if k not in self.metadata_columns
|
||||
)
|
||||
|
@ -85,6 +85,29 @@ class TestCSVLoader:
|
||||
# Assert
|
||||
assert result == expected_docs
|
||||
|
||||
def test_csv_loader_load_none_column_file(self) -> None:
|
||||
# Setup
|
||||
file_path = self._get_csv_file_path("test_none_col.csv")
|
||||
expected_docs = [
|
||||
Document(
|
||||
page_content="column1: value1\ncolumn2: value2\n"
|
||||
"column3: value3\nNone: value4,value5",
|
||||
metadata={"source": file_path, "row": 0},
|
||||
),
|
||||
Document(
|
||||
page_content="column1: value6\ncolumn2: value7\n"
|
||||
"column3: value8\nNone: value9",
|
||||
metadata={"source": file_path, "row": 1},
|
||||
),
|
||||
]
|
||||
|
||||
# Exercise
|
||||
loader = CSVLoader(file_path=file_path)
|
||||
result = loader.load()
|
||||
|
||||
# Assert
|
||||
assert result == expected_docs
|
||||
|
||||
# utility functions
|
||||
def _get_csv_file_path(self, file_name: str) -> str:
|
||||
return str(Path(__file__).resolve().parent / "test_docs" / "csv" / file_name)
|
||||
|
@ -58,6 +58,22 @@ class TestDirectoryLoader:
|
||||
"row": 1,
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="column1: value1\ncolumn2: value2\n"
|
||||
"column3: value3\nNone: value4,value5",
|
||||
metadata={
|
||||
"source": self._get_csv_file_path("test_none_col.csv"),
|
||||
"row": 0,
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="column1: value6\ncolumn2: value7\n"
|
||||
"column3: value8\nNone: value9",
|
||||
metadata={
|
||||
"source": self._get_csv_file_path("test_none_col.csv"),
|
||||
"row": 1,
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
loaded_docs = sorted(loader.load(), key=lambda doc: doc.metadata["source"])
|
||||
@ -141,6 +157,20 @@ class TestDirectoryLoader:
|
||||
metadata={"source": file_path, "row": 0},
|
||||
)
|
||||
]
|
||||
file_name = "test_none_col.csv"
|
||||
file_path = self._get_csv_file_path(file_name)
|
||||
expected_docs += [
|
||||
Document(
|
||||
page_content="column1: value1\ncolumn2: value2\n"
|
||||
"column3: value3\nNone: value4,value5",
|
||||
metadata={"source": file_path, "row": 0},
|
||||
),
|
||||
Document(
|
||||
page_content="column1: value6\ncolumn2: value7\n"
|
||||
"column3: value8\nNone: value9",
|
||||
metadata={"source": file_path, "row": 1},
|
||||
),
|
||||
]
|
||||
|
||||
# Assert
|
||||
loader = DirectoryLoader(dir_path, loader_cls=CSVLoader)
|
||||
|
@ -0,0 +1,3 @@
|
||||
column1,column2,column3
|
||||
value1,value2,value3,value4,value5
|
||||
value6,value7,value8,value9
|
|
Loading…
Reference in New Issue
Block a user