mirror of
https://github.com/csunny/DB-GPT.git
synced 2026-01-14 04:07:28 +00:00
fix: table error fix
This commit is contained in:
@@ -475,8 +475,13 @@ class BenchmarkDataManager(BaseComponent):
|
||||
dialect = sniffer.sniff(sample_for_sniff)
|
||||
except Exception:
|
||||
|
||||
# Fallback: choose delimiter by counting common separators in header/data line
|
||||
delims = [",", "\t", ";", "|"]
|
||||
counts = {d: (header_line.count(d) if header_line else 0) + (data_line.count(d) if data_line else 0) for d in delims}
|
||||
best = max(counts, key=counts.get) if any(counts.values()) else ","
|
||||
|
||||
class _DefaultDialect(csv.Dialect):
|
||||
delimiter = ","
|
||||
delimiter = best
|
||||
quotechar = '"'
|
||||
doublequote = True
|
||||
skipinitialspace = False
|
||||
@@ -501,6 +506,21 @@ class BenchmarkDataManager(BaseComponent):
|
||||
else []
|
||||
)
|
||||
|
||||
# Heuristic: if has_header is False but header_row looks like names (mostly alphabetic), treat as header
|
||||
if not has_header:
|
||||
def _looks_like_header(tokens: List[str]) -> bool:
|
||||
if not tokens:
|
||||
return False
|
||||
# 非空、重复少、字母比例高
|
||||
cleaned = [str(t).strip() for t in tokens if str(t).strip()]
|
||||
if not cleaned:
|
||||
return False
|
||||
# 允许少量数字,但大多以字母开头
|
||||
alpha_starts = sum(1 for t in cleaned if t and (t[0].isalpha() or t[0] == '_'))
|
||||
return alpha_starts >= max(1, int(0.6 * len(cleaned)))
|
||||
if _looks_like_header(header_row):
|
||||
has_header = True
|
||||
|
||||
if not has_header:
|
||||
num_cols_guess = len(header_row)
|
||||
headers = [f"col_{i}" for i in range(num_cols_guess)]
|
||||
|
||||
Reference in New Issue
Block a user