Increase the accuracy of reading Excel (#1209)

2025-09-14 13:40:54 +00:00 · 2024-02-28 16:47:29 +08:00
parent 08fcb4f744
commit 0837da48ba
2 changed files with 46 additions and 9 deletions
--- a/dbgpt/app/scene/chat_data/chat_excel/excel_reader.py
+++ b/dbgpt/app/scene/chat_data/chat_excel/excel_reader.py
@@ -248,21 +248,42 @@ class ExcelReader:
                file_path,
                index_col=False,
                encoding=encoding,
+                # csv_colunm_foramt can be extended further; it currently only handles the dollar and RMB (yuan) symbols, so a value such as "你好¥¥¥" will raise an error!
                converters={i: csv_colunm_foramt for i in range(df_tmp.shape[1])},
            )
        else:
            raise ValueError("Unsupported file format.")

        self.df.replace("", np.nan, inplace=True)
+
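+        # Modified section: drop "Unnamed" columns that are entirely null and keep only the remaining df_tmp columns in self.df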
+
+        unnamed_columns_tmp = [
+            col
+            for col in df_tmp.columns
+            if col.startswith("Unnamed") and df_tmp[col].isnull().all()
+        ]
+        df_tmp.drop(columns=unnamed_columns_tmp, inplace=True)
+
+        self.df = self.df[df_tmp.columns.values]
+        #
+
        self.columns_map = {}
        for column_name in df_tmp.columns:
+            self.df[column_name] = self.df[column_name].astype(str)
            self.columns_map.update({column_name: excel_colunm_format(column_name)})
            try:
-                if not pd.api.types.is_datetime64_ns_dtype(self.df[column_name]):
+                self.df[column_name] = pd.to_datetime(self.df[column_name]).dt.strftime(
+                    "%Y-%m-%d"
+                )
+            except ValueError:
+                try:
                    self.df[column_name] = pd.to_numeric(self.df[column_name])
-                self.df[column_name] = self.df[column_name].fillna(0)
-            except Exception as e:
-                print("can't transfor numeric column" + column_name)
+                except ValueError:
+                    try:
+                        self.df[column_name] = self.df[column_name].astype(str)
+                    except Exception:
+                        print("Can't transform column: " + column_name)

        self.df = self.df.rename(columns=lambda x: x.strip().replace(" ", "_"))