From e69a4e587fa938317de1e7382e37d060d11755d5 Mon Sep 17 00:00:00 2001 From: yaoyifan-yyf Date: Mon, 20 Oct 2025 13:48:40 +0800 Subject: [PATCH] fix: table error fix --- .../fetchdata/benchmark_data_manager.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py index 6c8c7e579..51aeaf1b8 100644 --- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py +++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py @@ -475,8 +475,13 @@ class BenchmarkDataManager(BaseComponent): dialect = sniffer.sniff(sample_for_sniff) except Exception: + # Fallback: choose delimiter by counting common separators in header/data line + delims = [",", "\t", ";", "|"] + counts = {d: (header_line.count(d) if header_line else 0) + (data_line.count(d) if data_line else 0) for d in delims} + best = max(counts, key=counts.get) if any(counts.values()) else "," + class _DefaultDialect(csv.Dialect): - delimiter = "," + delimiter = best quotechar = '"' doublequote = True skipinitialspace = False @@ -501,6 +506,21 @@ class BenchmarkDataManager(BaseComponent): else [] ) + # Heuristic: if has_header is False but header_row looks like names (mostly alphabetic), treat as header + if not has_header: + def _looks_like_header(tokens: List[str]) -> bool: + if not tokens: + return False + # Keep only non-blank tokens; NOTE(review): the original comment also mentions "few duplicates", but no duplicate check is performed + cleaned = [str(t).strip() for t in tokens if str(t).strip()] + if not cleaned: + return False + # A few numeric tokens are allowed, but most must start with a letter or underscore; NOTE(review): int() truncates the 60% threshold, so with 3 tokens a single match is enough + alpha_starts = sum(1 for t in cleaned if t and (t[0].isalpha() or t[0] == '_')) + return alpha_starts >= max(1, int(0.6 * len(cleaned))) + if _looks_like_header(header_row): + has_header = True + if not has_header: num_cols_guess = len(header_row) headers = [f"col_{i}" for i in range(num_cols_guess)]