feat(editor): ChatExcel

🔥ChatExcel Mode Operation Manual
This commit is contained in:
yhjun1026
2023-08-29 20:24:51 +08:00
parent d8ca59d9e4
commit 237992e7fa
9 changed files with 159 additions and 16 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 783 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 366 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 110 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 124 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 138 KiB

View File

@@ -0,0 +1,26 @@
ChatExcel
==================================
ChatExcel uses natural language to analyze and query Excel data.

![db plugins demonstration](../../../../assets/chat_excel/chat_excel_1.png)
### 1. Select and Upload an Excel or CSV File
Select the Excel or CSV file you want to analyze, upload it, and start the conversation.
```{tip}
ChatExcel
ChatExcel supports files in Excel and CSV format; select the file you want to use.
```
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_2.png)
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_3.png)
### 2. Wait for Data Processing
After the file is uploaded, ChatExcel first learns and processes the data structure and the meaning of each field.
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_4.png)
### 3. Analyze and Query the Data
You can now use natural language in the dialog box to analyze and query the data.
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_5.png)
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_6.png)
![db plugins demonstration](../../../../assets/chat_excel/chat_excel_7.png)

View File

@@ -3,38 +3,155 @@ import duckdb
import pandas as pd import pandas as pd
import matplotlib import matplotlib
import seaborn as sns import seaborn as sns
import uuid
from pandas import DataFrame
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib import font_manager
from matplotlib.font_manager import FontManager
matplotlib.use("Agg")
import time import time
from fsspec import filesystem from fsspec import filesystem
import spatial import spatial
from pilot.scene.chat_data.chat_excel.excel_reader import ExcelReader from pilot.scene.chat_data.chat_excel.excel_reader import ExcelReader
def data_pre_classification(df: DataFrame):
    """Pick the x, y and hue columns to use when charting ``df``.

    A column counts as numeric when every one of its values survives
    ``pd.to_numeric(..., errors="coerce")`` without becoming NaN; every
    other column counts as non-numeric.

    Selection rules:

    * Only numeric columns: ``x`` is the first numeric column, ``hue`` is
      the numeric column with the fewest distinct values and ``y`` is the
      one with the second fewest (ties keep the original column order).
    * Numeric and non-numeric columns: ``x`` is the first non-numeric
      column, ``y`` is the first numeric column and ``hue`` is ``None``.

    Args:
        df: The frame whose columns should be classified.

    Returns:
        A ``(x_column, y_column, hue_column)`` tuple of column labels.
        ``hue_column`` is ``None`` when no hue column was selected.

    Raises:
        ValueError: If ``df`` has no numeric column, or if it consists of
            a single numeric column only (an x/y pair cannot be formed).
    """
    number_columns = []
    non_numeric_colums = []
    # Distinct-value count per numeric column, used to rank hue / y candidates.
    numeric_colums_value_map = {}
    for column_name in df.columns.tolist():
        column = df[column_name]
        if pd.to_numeric(column, errors="coerce").notna().all():
            number_columns.append(column_name)
            numeric_colums_value_map[column_name] = len(column.unique())
        else:
            non_numeric_colums.append(column_name)

    # Checked first so that an empty frame also gets the explicit error
    # instead of an IndexError further down.
    if not number_columns:
        raise ValueError("Have No numeric Column")

    if not non_numeric_colums:
        if len(number_columns) < 2:
            raise ValueError(
                "Need at least two numeric columns when there is no "
                "non-numeric column"
            )
        # sorted() is stable, so equal counts keep the frame's column order.
        numeric_colums_sort_list = sorted(
            number_columns, key=numeric_colums_value_map.__getitem__
        )
        x_column = number_columns[0]
        hue_column = numeric_colums_sort_list[0]
        y_column = numeric_colums_sort_list[1]
        return x_column, y_column, hue_column

    # Both numeric and non-numeric columns exist: only the first of each is
    # used and the remaining columns are ignored. No hue column is chosen
    # here, so return None rather than leaving the name unbound.
    x_column = non_numeric_colums[0]
    y_column = number_columns[0]
    hue_column = None
    return x_column, y_column, hue_column
if __name__ == "__main__": if __name__ == "__main__":
# connect = duckdb.connect("/Users/tuyang.yhj/Downloads/example.xlsx") # connect = duckdb.connect("/Users/tuyang.yhj/Downloads/example.xlsx")
#
# fonts = fm.findSystemFonts()
# for font in fonts:
# if 'Hei' in font:
# print(font)
# fm = FontManager()
# mat_fonts = set(f.name for f in fm.ttflist)
# for i in mat_fonts:
# print(i)
# print(len(mat_fonts))
# 获取系统中的默认中文字体名称
# default_font = fm.fontManager.defaultFontProperties.get_family()
# #
excel_reader = ExcelReader("/Users/tuyang.yhj/Downloads/example.xlsx") excel_reader = ExcelReader("/Users/tuyang.yhj/Downloads/example.xlsx")
#
# # colunms, datas = excel_reader.run( "SELECT CONCAT(Year, '-', Quarter) AS QuarterYear, SUM(Sales) AS TotalSales FROM example GROUP BY QuarterYear ORDER BY QuarterYear")
# # colunms, datas = excel_reader.run( """ SELECT Year, SUM(Sales) AS Total_Sales FROM example GROUP BY Year ORDER BY Year; """)
df = excel_reader.get_df_by_sql_ex(""" SELECT Segment, Country, SUM(Sales) AS Total_Sales, SUM(Profit) AS Total_Profit FROM example GROUP BY Segment, Country """)
x,y,hue =data_pre_classification(df)
print(x, y, hue)
# colunms, datas = excel_reader.run( "SELECT CONCAT(Year, '-', Quarter) AS QuarterYear, SUM(Sales) AS TotalSales FROM example GROUP BY QuarterYear ORDER BY QuarterYear")
colunms, datas = excel_reader.run( """ SELECT Year, SUM(Sales) AS Total_Sales FROM example GROUP BY Year ORDER BY Year; """)
df = excel_reader.get_df_by_sql_ex("SELECT Country, SUM(Profit) AS Total_Profit FROM example GROUP BY Country;")
columns = df.columns.tolist() columns = df.columns.tolist()
plt.rcParams["font.family"] = ["sans-serif"] font_names = ['Heiti TC', 'Songti SC', 'STHeiti Light', 'Microsoft YaHei', 'SimSun', 'SimHei', 'KaiTi']
rc = {"font.sans-serif": "SimHei", "axes.unicode_minus": False} fm = FontManager()
sns.set_style(rc={'font.sans-serif': "Microsoft Yahei"}) mat_fonts = set(f.name for f in fm.ttflist)
sns.set(context="notebook", style="ticks", color_codes=True, rc=rc) can_use_fonts = []
sns.set_palette("Set3") # 设置颜色主题 for font_name in font_names:
if font_name in mat_fonts:
can_use_fonts.append(font_name)
if len(can_use_fonts) > 0:
plt.rcParams['font.sans-serif'] = can_use_fonts
rc = {'font.sans-serif': can_use_fonts}
plt.rcParams['axes.unicode_minus'] = False # 解决无法显示符号的问题
sns.set(font='Heiti TC', font_scale=0.8) # 解决Seaborn中文显示问题
sns.set_palette("Set3") # 设置颜色主题
sns.set_style("dark")
sns.color_palette("hls", 10)
sns.hls_palette(8, l=.5, s=.7)
sns.set(context='notebook', style='ticks', rc=rc)
# sns.set_palette("Set3") # 设置颜色主题
# sns.set_style("dark")
# sns.color_palette("hls", 10)
# sns.hls_palette(8, l=.5, s=.7)
# sns.set(context='notebook', style='ticks', rc=rc)
# fig, ax = plt.pie(df[columns[1]], labels=df[columns[0]], autopct='%1.1f%%', startangle=90)
fig, ax = plt.subplots(figsize=(8, 5), dpi=100) fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
plt.subplots_adjust(top=0.9) # plt.ticklabel_format(style='plain')
ax = df.plot(kind='pie', y=columns[1], ax=ax, labels=df[columns[0]].values, startangle=90, autopct='%1.1f%%') # ax = df.plot(kind='bar', ax=ax)
# 手动设置 labels 的位置和大小 # sns.barplot(df, x=x, y=y, hue= "Country", ax=ax)
ax.legend(loc='center left', bbox_to_anchor=(-1, 0.5, 0,0), labels=None, fontsize=10) sns.catplot(data=df, x=x, y=y, hue='Country', kind='bar')
plt.axis('equal') # 使饼图为正圆形 # 设置 y 轴刻度格式为普通数字格式
plt.show() ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
# fonts = font_manager.findSystemFonts()
# font_path = ""
# for font in fonts:
# if "Heiti" in font:
# font_path = font
# my_font = font_manager.FontProperties(fname=font_path)
# plt.title("测试", fontproperties=my_font)
# plt.ylabel(columns[1], fontproperties=my_font)
# plt.xlabel(columns[0], fontproperties=my_font)
chart_name = "bar_" + str(uuid.uuid1()) + ".png"
chart_path = chart_name
plt.savefig(chart_path, bbox_inches='tight', dpi=100)
# sns.set(context="notebook", style="ticks", color_codes=True)
# sns.set_palette("Set3") # 设置颜色主题
#
# # fig, ax = plt.pie(df[columns[1]], labels=df[columns[0]], autopct='%1.1f%%', startangle=90)
# fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
# plt.subplots_adjust(top=0.9)
# ax = df.plot(kind='pie', y=columns[1], ax=ax, labels=df[columns[0]].values, startangle=90, autopct='%1.1f%%')
# # 手动设置 labels 的位置和大小
# ax.legend(loc='center left', bbox_to_anchor=(-1, 0.5, 0,0), labels=None, fontsize=10)
# plt.axis('equal') # 使饼图为正圆形
# plt.show()
# #
# #
# def csv_colunm_foramt(val): # def csv_colunm_foramt(val):