fix: table error fix

2026-01-14 04:07:28 +00:00 · 2025-10-20 13:48:40 +08:00
parent 2a823ee25c
commit e69a4e587f
1 changed files with 21 additions and 1 deletions
--- a/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py
+++ b/packages/dbgpt-serve/src/dbgpt_serve/evaluate/service/fetchdata/benchmark_data_manager.py
@@ -475,8 +475,13 @@ class BenchmarkDataManager(BaseComponent):
                                dialect = sniffer.sniff(sample_for_sniff)
                            except Exception:

+                                # Fallback: choose delimiter by counting common separators in header/data line
+                                delims = [",", "\t", ";", "|"]
+                                counts = {d: (header_line.count(d) if header_line else 0) + (data_line.count(d) if data_line else 0) for d in delims}
+                                best = max(counts, key=counts.get) if any(counts.values()) else ","
+
                                class _DefaultDialect(csv.Dialect):
-                                    delimiter = ","
+                                    delimiter = best
                                    quotechar = '"'
                                    doublequote = True
                                    skipinitialspace = False
@@ -501,6 +506,21 @@ class BenchmarkDataManager(BaseComponent):
                                else []
                            )

+                            # Heuristic: if has_header is False but header_row looks like names (mostly alphabetic), treat as header
+                            if not has_header:
+                                def _looks_like_header(tokens: List[str]) -> bool:
+                                    if not tokens:
+                                        return False
+                                    # Ignore blank tokens; a row made up only of blanks is not a header
+                                    cleaned = [str(t).strip() for t in tokens if str(t).strip()]
+                                    if not cleaned:
+                                        return False
+                                    # Tolerate some numeric tokens: about 60% (rounded down, at least one) must start with a letter or "_"
+                                    alpha_starts = sum(1 for t in cleaned if t and (t[0].isalpha() or t[0] == '_'))
+                                    return alpha_starts >= max(1, int(0.6 * len(cleaned)))
+                                if _looks_like_header(header_row):
+                                    has_header = True
+
                            if not has_header:
                                num_cols_guess = len(header_row)
                                headers = [f"col_{i}" for i in range(num_cols_guess)]