Added autodetect_encoding option to csvLoader (#11327)

This commit is contained in:
Taikono-Himazin 2023-10-10 00:06:43 +09:00 committed by GitHub
parent 09c66fe04f
commit eec53fa294
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 107 additions and 20 deletions

View File

@ -1,8 +1,10 @@
import csv import csv
from io import TextIOWrapper
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.document_loaders.unstructured import ( from langchain.document_loaders.unstructured import (
UnstructuredFileLoader, UnstructuredFileLoader,
validate_unstructured_version, validate_unstructured_version,
@ -36,6 +38,7 @@ class CSVLoader(BaseLoader):
source_column: Optional[str] = None, source_column: Optional[str] = None,
csv_args: Optional[Dict] = None, csv_args: Optional[Dict] = None,
encoding: Optional[str] = None, encoding: Optional[str] = None,
autodetect_encoding: bool = False,
): ):
""" """
@ -46,17 +49,42 @@ class CSVLoader(BaseLoader):
csv_args: A dictionary of arguments to pass to the csv.DictReader. csv_args: A dictionary of arguments to pass to the csv.DictReader.
Optional. Defaults to None. Optional. Defaults to None.
encoding: The encoding of the CSV file. Optional. Defaults to None. encoding: The encoding of the CSV file. Optional. Defaults to None.
autodetect_encoding: Whether to try to autodetect the file encoding.
""" """
self.file_path = file_path self.file_path = file_path
self.source_column = source_column self.source_column = source_column
self.encoding = encoding self.encoding = encoding
self.csv_args = csv_args or {} self.csv_args = csv_args or {}
self.autodetect_encoding = autodetect_encoding
def load(self) -> List[Document]:
    """Load data into document objects.

    The file is first opened with ``self.encoding``. If that raises a
    ``UnicodeDecodeError`` and ``self.autodetect_encoding`` is set, each
    candidate returned by ``detect_file_encodings`` is tried in turn and
    the first one that decodes the whole file is used.

    Returns:
        The documents produced from the CSV file.

    Raises:
        RuntimeError: If the file cannot be read, if decoding fails while
            autodetection is disabled, or if autodetection is enabled but
            none of the detected encodings can decode the file.
    """
    try:
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            return self.__read_file(csvfile)
    except UnicodeDecodeError as e:
        if not self.autodetect_encoding:
            raise RuntimeError(f"Error loading {self.file_path}") from e
        for encoding in detect_file_encodings(self.file_path):
            try:
                with open(
                    self.file_path, newline="", encoding=encoding.encoding
                ) as csvfile:
                    return self.__read_file(csvfile)
            except UnicodeDecodeError:
                # This candidate could not decode the file; try the next one.
                continue
        # Every candidate failed (or none were detected). Report the failure
        # instead of silently returning an empty list of documents.
        raise RuntimeError(f"Error loading {self.file_path}") from e
    except Exception as e:
        raise RuntimeError(f"Error loading {self.file_path}") from e
def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
docs = []
csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore
for i, row in enumerate(csv_reader): for i, row in enumerate(csv_reader):
content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items()) content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items())

View File

@ -2,12 +2,12 @@ from pathlib import Path
import pytest import pytest
from langchain.document_loaders import DirectoryLoader, TextLoader from langchain.document_loaders import CSVLoader, DirectoryLoader, TextLoader
from langchain.document_loaders.helpers import detect_file_encodings from langchain.document_loaders.helpers import detect_file_encodings
@pytest.mark.requires("chardet") @pytest.mark.requires("chardet")
def test_loader_detect_encoding() -> None: def test_loader_detect_encoding_text() -> None:
"""Test text loader.""" """Test text loader."""
path = Path(__file__).parent.parent / "examples" path = Path(__file__).parent.parent / "examples"
files = path.glob("**/*.txt") files = path.glob("**/*.txt")
@ -16,7 +16,7 @@ def test_loader_detect_encoding() -> None:
str(path), str(path),
glob="**/*.txt", glob="**/*.txt",
loader_kwargs={"autodetect_encoding": True}, loader_kwargs={"autodetect_encoding": True},
loader_cls=TextLoader, loader_cls=TextLoader, # type: ignore
) )
with pytest.raises((UnicodeDecodeError, RuntimeError)): with pytest.raises((UnicodeDecodeError, RuntimeError)):
@ -26,6 +26,43 @@ def test_loader_detect_encoding() -> None:
assert len(docs) == len(list(files)) assert len(docs) == len(list(files))
@pytest.mark.requires("chardet")
def test_loader_detect_encoding_csv() -> None:
    """Test that CSVLoader can load non-default-encoded files via autodetection.

    Without ``autodetect_encoding`` the directory load is expected to fail;
    with it, the number of documents must equal the number of data rows
    across all example CSV files.
    """
    path = Path(__file__).parent.parent / "examples"
    files = path.glob("**/*.csv")

    # Count the number of lines.
    row_count = 0
    for file in files:
        encodings = detect_file_encodings(str(file))
        for encoding in encodings:
            try:
                # Use a context manager so the handle is closed even when
                # decoding fails part-way through the file.
                with open(file, encoding=encoding.encoding) as f:
                    row_count += sum(1 for _ in f)
                break
            except UnicodeDecodeError:
                continue
        # CSVLoader uses DictReader, and one line per file is a header,
        # so subtract one line for each file.
        row_count -= 1

    loader = DirectoryLoader(
        str(path), glob="**/*.csv", loader_cls=CSVLoader  # type: ignore
    )
    loader_detect_encoding = DirectoryLoader(
        str(path),
        glob="**/*.csv",
        loader_kwargs={"autodetect_encoding": True},
        loader_cls=CSVLoader,  # type: ignore
    )

    with pytest.raises((UnicodeDecodeError, RuntimeError)):
        loader.load()

    docs = loader_detect_encoding.load()
    assert len(docs) == row_count
@pytest.mark.skip(reason="slow test") @pytest.mark.skip(reason="slow test")
@pytest.mark.requires("chardet") @pytest.mark.requires("chardet")
def test_loader_detect_encoding_timeout(tmpdir: str) -> None: def test_loader_detect_encoding_timeout(tmpdir: str) -> None:

View File

@ -0,0 +1,11 @@
行ID,製品名,顧客名,顧客ID,売上,価格,送料,都道府県,製品カテゴリ,割引
1,"Eldon スタッカブル収納棚用ベース、プラチナ",モハメド・マッキンタイア,3,-213.25,38.94,35,ヌナブット準州,保管と整理,0.8
2,"1.7立方フィートのコンパクト「キューブ」オフィス冷蔵庫",バリー・フレンチ,293,457.81,208.16,68.02,ヌナブット準州,家電製品,0.58
3,"Cardinal Slant-D? リング バインダー、ヘビーゲージ ビニール",バリー・フレンチ,293,46.71,8.69,2.99,ヌナブット準州,バインダーおよびバインダー付属品,0.39
4,"R380",クレイ・ロゼンダル,483,1198.97,195.99,3.99,ヌナブット準州,電話と通信,0.58
5,"ホームズ HEPA 空気清浄機",カルロス・ソルテロ,515,30.94,21.78,5.94,ヌナブット準州,家電製品,0.5
6,"GE 長寿命の屋内埋込型投光器電球",カルロス・ソルテロ,515,4.43,6.64,4.95,ヌナブット準州,オフィス家具,0.37
7,"ロックリング付きアングルDバインダー、ラベルホルダー",カール・ジャクソン,613,-54.04,7.3,7.72,ヌナブット準州,バインダーおよびバインダー付属品,0.38
8,"SAFCO モバイルデスクサイドファイル ワイヤーフレーム",カール・ジャクソン,613,127.70,42.76,6.22,ヌナブット準州,保管と整理,
9,"SAFCO 業務用ワイヤーシェルフ ブラック",モニカ・フェデル,643,-695.26,138.14,35,ヌナブット準州,保管と整理,
10,"ゼロックス 198",ドロシー・バッダーズ,678,-226.36,4.98,8.33,ヌナブット準州,紙,0.38
1 行ID 製品名 顧客名 顧客ID 売上 価格 送料 都道府県 製品カテゴリ 割引
2 1 Eldon スタッカブル収納棚用ベース、プラチナ モハメド・マッキンタイア 3 -213.25 38.94 35 ヌナブット準州 保管と整理 0.8
3 2 1.7立方フィートのコンパクト「キューブ」オフィス冷蔵庫 バリー・フレンチ 293 457.81 208.16 68.02 ヌナブット準州 家電製品 0.58
4 3 Cardinal Slant-D? リング バインダー、ヘビーゲージ ビニール バリー・フレンチ 293 46.71 8.69 2.99 ヌナブット準州 バインダーおよびバインダー付属品 0.39
5 4 R380 クレイ・ロゼンダル 483 1198.97 195.99 3.99 ヌナブット準州 電話と通信 0.58
6 5 ホームズ HEPA 空気清浄機 カルロス・ソルテロ 515 30.94 21.78 5.94 ヌナブット準州 家電製品 0.5
7 6 GE 長寿命の屋内埋込型投光器電球 カルロス・ソルテロ 515 4.43 6.64 4.95 ヌナブット準州 オフィス家具 0.37
8 7 ロックリング付きアングルDバインダー、ラベルホルダー カール・ジャクソン 613 -54.04 7.3 7.72 ヌナブット準州 バインダーおよびバインダー付属品 0.38
9 8 SAFCO モバイルデスクサイドファイル ワイヤーフレーム カール・ジャクソン 613 127.70 42.76 6.22 ヌナブット準州 保管と整理
10 9 SAFCO 業務用ワイヤーシェルフ ブラック モニカ・フェデル 643 -695.26 138.14 35 ヌナブット準州 保管と整理
11 10 ゼロックス 198 ドロシー・バッダーズ 678 -226.36 4.98 8.33 ヌナブット準州 0.38

View File

@ -0,0 +1,11 @@
"Row ID","Product Name","Customer Name","Customer ID","Sales","Price","Shipping Cost","Province","Product Category","Discount"
1,"Eldon Base for stackable storage shelf, platinum",Muhammed MacIntyre,3,-213.25,38.94,35,Nunavut,Storage & Organization,0.8
2,"1.7 Cubic Foot Compact ""Cube"" Office Refrigerators",Barry French,293,457.81,208.16,68.02,Nunavut,Appliances,0.58
3,"Cardinal Slant-D® Ring Binder, Heavy Gauge Vinyl",Barry French,293,46.71,8.69,2.99,Nunavut,Binders and Binder Accessories,0.39
4,R380,Clay Rozendal,483,1198.97,195.99,3.99,Nunavut,Telephones and Communication,0.58
5,Holmes HEPA Air Purifier,Carlos Soltero,515,30.94,21.78,5.94,Nunavut,Appliances,0.5
6,G.E. Longer-Life Indoor Recessed Floodlight Bulbs,Carlos Soltero,515,4.43,6.64,4.95,Nunavut,Office Furnishings,0.37
7,"Angle-D Binders with Locking Rings, Label Holders",Carl Jackson,613,-54.04,7.3,7.72,Nunavut,Binders and Binder Accessories,0.38
8,"SAFCO Mobile Desk Side File, Wire Frame",Carl Jackson,613,127.70,42.76,6.22,Nunavut,Storage & Organization,
9,"SAFCO Commercial Wire Shelving, Black",Monica Federle,643,-695.26,138.14,35,Nunavut,Storage & Organization,
10,Xerox 198,Dorothy Badders,678,-226.36,4.98,8.33,Nunavut,Paper,0.38
1 Row ID Product Name Customer Name Customer ID Sales Price Shipping Cost Province Product Category Discount
2 1 Eldon Base for stackable storage shelf, platinum Muhammed MacIntyre 3 -213.25 38.94 35 Nunavut Storage & Organization 0.8
3 2 1.7 Cubic Foot Compact "Cube" Office Refrigerators Barry French 293 457.81 208.16 68.02 Nunavut Appliances 0.58
4 3 Cardinal Slant-D® Ring Binder, Heavy Gauge Vinyl Barry French 293 46.71 8.69 2.99 Nunavut Binders and Binder Accessories 0.39
5 4 R380 Clay Rozendal 483 1198.97 195.99 3.99 Nunavut Telephones and Communication 0.58
6 5 Holmes HEPA Air Purifier Carlos Soltero 515 30.94 21.78 5.94 Nunavut Appliances 0.5
7 6 G.E. Longer-Life Indoor Recessed Floodlight Bulbs Carlos Soltero 515 4.43 6.64 4.95 Nunavut Office Furnishings 0.37
8 7 Angle-D Binders with Locking Rings, Label Holders Carl Jackson 613 -54.04 7.3 7.72 Nunavut Binders and Binder Accessories 0.38
9 8 SAFCO Mobile Desk Side File, Wire Frame Carl Jackson 613 127.70 42.76 6.22 Nunavut Storage & Organization
10 9 SAFCO Commercial Wire Shelving, Black Monica Federle 643 -695.26 138.14 35 Nunavut Storage & Organization
11 10 Xerox 198 Dorothy Badders 678 -226.36 4.98 8.33 Nunavut Paper 0.38