feat(editor): ChatExcel

🔥ChatExcel Mode Operation Manual
This commit is contained in:
yhjun1026 2023-08-29 20:24:51 +08:00
parent d8ca59d9e4
commit 237992e7fa
9 changed files with 159 additions and 16 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 783 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 366 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 110 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 124 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 138 KiB

View File

@ -0,0 +1,26 @@
ChatExcel
==================================
ChatExcel uses natural language to analyze and query Excel data.

![db plugins demonstration](../../../../assets/chat_excel/chat_excel_1.png)
### 1. Select and Upload an Excel or CSV File
Select your Excel or CSV file to upload and start the conversation.
```{tip}
ChatExcel
ChatExcel supports Excel and CSV files; select the file you want to use.
```
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_2.png)
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_3.png)
### 2. Wait for Data Processing
After the file is uploaded, ChatExcel first learns and processes the data structure and the meaning of each field.
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_4.png)
### 3. Use Data Analysis and Calculation
Now you can use natural language to analyze and query data in the dialog box.
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_5.png)
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_6.png)
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_7.png)

View File

@ -3,38 +3,155 @@ import duckdb
import pandas as pd
import matplotlib
import seaborn as sns
import uuid
from pandas import DataFrame
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib import font_manager
from matplotlib.font_manager import FontManager
matplotlib.use("Agg")
import time
from fsspec import filesystem
import spatial
import spatial
from pilot.scene.chat_data.chat_excel.excel_reader import ExcelReader
def data_pre_classification(df: DataFrame):
    """Choose the x, y and hue columns to use when charting ``df``.

    A column counts as numeric when every one of its values can be parsed by
    ``pd.to_numeric``; all other columns are treated as categorical.

    Selection rules:

    * Only numeric columns: ``x`` is the first numeric column, ``hue`` is the
      numeric column with the fewest distinct values and ``y`` the one with
      the second fewest (ties keep the original column order).
    * Numeric and categorical columns: ``x`` is the first categorical column,
      ``y`` is the first numeric column, and ``hue`` is the remaining
      categorical column with the fewest distinct values, or ``None`` when
      there is no other categorical column.

    Args:
        df: The query result to classify.

    Returns:
        A ``(x_column, y_column, hue_column)`` tuple of column labels;
        ``hue_column`` may be ``None``.

    Raises:
        ValueError: If ``df`` has no numeric column, or if it has only
            numeric columns but fewer than two of them.
    """
    numeric_columns = []
    non_numeric_columns = []
    # Number of distinct values per column, used to rank hue candidates.
    unique_counts = {}

    for column_name in df.columns.tolist():
        series = df[column_name]
        unique_counts[column_name] = len(series.unique())
        if pd.to_numeric(series, errors="coerce").notna().all():
            numeric_columns.append(column_name)
        else:
            non_numeric_columns.append(column_name)

    if not numeric_columns:
        raise ValueError("Have No numeric Column")

    if not non_numeric_columns:
        # Two distinct roles (hue and y) are taken from the ranking below, so
        # a single numeric column cannot be charted this way.
        if len(numeric_columns) < 2:
            raise ValueError("Need at least two numeric columns")
        by_cardinality = sorted(
            numeric_columns, key=lambda name: unique_counts[name]
        )
        return numeric_columns[0], by_cardinality[1], by_cardinality[0]

    # Both kinds of column exist: extra numeric columns are ignored.
    x_column = non_numeric_columns[0]
    y_column = numeric_columns[0]
    hue_candidates = non_numeric_columns[1:]
    # Previously hue_column was left unassigned on this path, which made the
    # return statement fail; fall back to None when nothing is left for hue.
    hue_column = (
        min(hue_candidates, key=lambda name: unique_counts[name])
        if hue_candidates
        else None
    )
    return x_column, y_column, hue_column
# Manual demo: loads a local example workbook through ExcelReader, runs a few
# SQL queries against it, then draws a pie chart and a bar chart and saves the
# result as a PNG in the current working directory.
# NOTE(review): the indentation of this block has been lost and several
# statements appear twice (e.g. the seaborn setup) — this looks like old and
# new lines of a diff merged together; confirm against the real file.
if __name__ == "__main__":
# connect = duckdb.connect("/Users/tuyang.yhj/Downloads/example.xlsx")
#
# fonts = fm.findSystemFonts()
# for font in fonts:
# if 'Hei' in font:
# print(font)
# fm = FontManager()
# mat_fonts = set(f.name for f in fm.ttflist)
# for i in mat_fonts:
# print(i)
# print(len(mat_fonts))
# Get the name of the system's default Chinese font
# default_font = fm.fontManager.defaultFontProperties.get_family()
#
# NOTE(review): hard-coded path on the author's machine.
excel_reader = ExcelReader("/Users/tuyang.yhj/Downloads/example.xlsx")
#
# # colunms, datas = excel_reader.run( "SELECT CONCAT(Year, '-', Quarter) AS QuarterYear, SUM(Sales) AS TotalSales FROM example GROUP BY QuarterYear ORDER BY QuarterYear")
# # colunms, datas = excel_reader.run( """ SELECT Year, SUM(Sales) AS Total_Sales FROM example GROUP BY Year ORDER BY Year; """)
df = excel_reader.get_df_by_sql_ex(""" SELECT Segment, Country, SUM(Sales) AS Total_Sales, SUM(Profit) AS Total_Profit FROM example GROUP BY Segment, Country """)
# x, y and hue are derived from the Segment/Country query result above.
x,y,hue =data_pre_classification(df)
print(x, y, hue)
# colunms, datas = excel_reader.run( "SELECT CONCAT(Year, '-', Quarter) AS QuarterYear, SUM(Sales) AS TotalSales FROM example GROUP BY QuarterYear ORDER BY QuarterYear")
colunms, datas = excel_reader.run( """ SELECT Year, SUM(Sales) AS Total_Sales FROM example GROUP BY Year ORDER BY Year; """)
# df is replaced here by a two-column (Country, Total_Profit) result.
df = excel_reader.get_df_by_sql_ex("SELECT Country, SUM(Profit) AS Total_Profit FROM example GROUP BY Country;")
columns = df.columns.tolist()
plt.rcParams["font.family"] = ["sans-serif"]
rc = {"font.sans-serif": "SimHei", "axes.unicode_minus": False}
sns.set_style(rc={'font.sans-serif': "Microsoft Yahei"})
sns.set(context="notebook", style="ticks", color_codes=True, rc=rc)
sns.set_palette("Set3") # set the colour theme
# Candidate CJK-capable fonts; only those matplotlib's FontManager lists are kept.
font_names = ['Heiti TC', 'Songti SC', 'STHeiti Light', 'Microsoft YaHei', 'SimSun', 'SimHei', 'KaiTi']
fm = FontManager()
mat_fonts = set(f.name for f in fm.ttflist)
can_use_fonts = []
for font_name in font_names:
if font_name in mat_fonts:
can_use_fonts.append(font_name)
if len(can_use_fonts) > 0:
plt.rcParams['font.sans-serif'] = can_use_fonts
rc = {'font.sans-serif': can_use_fonts}
plt.rcParams['axes.unicode_minus'] = False # fix the minus sign not being displayed
sns.set(font='Heiti TC', font_scale=0.8) # fix Chinese text rendering in Seaborn
sns.set_palette("Set3") # set the colour theme
sns.set_style("dark")
# NOTE(review): the return values of the next two calls are discarded.
sns.color_palette("hls", 10)
sns.hls_palette(8, l=.5, s=.7)
sns.set(context='notebook', style='ticks', rc=rc)
# sns.set_palette("Set3") # set the colour theme
# sns.set_style("dark")
# sns.color_palette("hls", 10)
# sns.hls_palette(8, l=.5, s=.7)
# sns.set(context='notebook', style='ticks', rc=rc)
# fig, ax = plt.pie(df[columns[1]], labels=df[columns[0]], autopct='%1.1f%%', startangle=90)
fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
plt.subplots_adjust(top=0.9)
# Pie chart of the second column, labelled with the values of the first column.
ax = df.plot(kind='pie', y=columns[1], ax=ax, labels=df[columns[0]].values, startangle=90, autopct='%1.1f%%')
# Manually set the position and size of the labels
ax.legend(loc='center left', bbox_to_anchor=(-1, 0.5, 0,0), labels=None, fontsize=10)
plt.axis('equal') # make the pie a perfect circle
# NOTE(review): plt.show() runs before plt.savefig() below, and the backend
# was set to "Agg" at import time — confirm the saved figure is the intended one.
plt.show()
# plt.ticklabel_format(style='plain')
# ax = df.plot(kind='bar', ax=ax)
# sns.barplot(df, x=x, y=y, hue= "Country", ax=ax)
# NOTE(review): x and y were derived from the earlier Segment/Country query,
# but df now holds only Country and Total_Profit — confirm these columns exist.
sns.catplot(data=df, x=x, y=y, hue='Country', kind='bar')
# Format the y-axis ticks as plain numbers
# NOTE(review): ax still refers to the pie-chart axes assigned above.
ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
# fonts = font_manager.findSystemFonts()
# font_path = ""
# for font in fonts:
# if "Heiti" in font:
# font_path = font
# my_font = font_manager.FontProperties(fname=font_path)
# plt.title("测试", fontproperties=my_font)
# plt.ylabel(columns[1], fontproperties=my_font)
# plt.xlabel(columns[0], fontproperties=my_font)
# A uuid1-based name keeps repeated runs from overwriting earlier charts.
chart_name = "bar_" + str(uuid.uuid1()) + ".png"
chart_path = chart_name
plt.savefig(chart_path, bbox_inches='tight', dpi=100)
# sns.set(context="notebook", style="ticks", color_codes=True)
# sns.set_palette("Set3") # set the colour theme
#
# # fig, ax = plt.pie(df[columns[1]], labels=df[columns[0]], autopct='%1.1f%%', startangle=90)
# fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
# plt.subplots_adjust(top=0.9)
# ax = df.plot(kind='pie', y=columns[1], ax=ax, labels=df[columns[0]].values, startangle=90, autopct='%1.1f%%')
# # Manually set the position and size of the labels
# ax.legend(loc='center left', bbox_to_anchor=(-1, 0.5, 0,0), labels=None, fontsize=10)
# plt.axis('equal') # make the pie a perfect circle
# plt.show()
#
#
# def csv_colunm_foramt(val):