From dba7c2a8973dc9398157a5eff401605cc0e5d3df Mon Sep 17 00:00:00 2001 From: csunny Date: Tue, 21 Nov 2023 10:02:52 +0800 Subject: [PATCH] chores: extra code clean --- examples/app.py | 74 ----- examples/embdserver.py | 82 ------ examples/gpt_index.py | 19 -- examples/gradio_test.py | 21 -- .../knowledge_embedding/csv_embedding_test.py | 18 -- .../knowledge_embedding/pdf_embedding_test.py | 18 -- .../knowledge_embedding/url_embedding_test.py | 17 -- examples/proxy_example.py | 67 ----- examples/t5_example.py | 257 ------------------ 9 files changed, 573 deletions(-) delete mode 100644 examples/app.py delete mode 100644 examples/embdserver.py delete mode 100644 examples/gpt_index.py delete mode 100644 examples/gradio_test.py delete mode 100644 examples/knowledge_embedding/csv_embedding_test.py delete mode 100644 examples/knowledge_embedding/pdf_embedding_test.py delete mode 100644 examples/knowledge_embedding/url_embedding_test.py delete mode 100644 examples/proxy_example.py delete mode 100644 examples/t5_example.py diff --git a/examples/app.py b/examples/app.py deleted file mode 100644 index 07f8a5a51..000000000 --- a/examples/app.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- - -import gradio as gr -from langchain.agents import AgentType, initialize_agent, load_tools -from llama_index import ( - Document, - GPTVectorStoreIndex, - LangchainEmbedding, - LLMPredictor, - ServiceContext, -) - -from pilot.model.llm_out.vicuna_llm import VicunaEmbeddingLLM, VicunaRequestLLM - - -def agent_demo(): - llm = VicunaRequestLLM() - - tools = load_tools(["python_repl"], llm=llm) - agent = initialize_agent( - tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=True - ) - agent.run("Write a SQL script that Query 'select count(1)!'") - - -def knowledged_qa_demo(text_list): - llm_predictor = LLMPredictor(llm=VicunaRequestLLM()) - hfemb = VicunaEmbeddingLLM() - embed_model = LangchainEmbedding(hfemb) - documents = [Document(t) for t in text_list] - - service_context = ServiceContext.from_defaults( - llm_predictor=llm_predictor, embed_model=embed_model - ) - index = GPTVectorStoreIndex.from_documents( - documents, service_context=service_context - ) - return index - - -def get_answer(q): - base_knowledge = """ """ - text_list = [base_knowledge] - index = knowledged_qa_demo(text_list) - response = index.query(q) - return response.response - - -def get_similar(q): - from pilot.vector_store.extract_tovec import knownledge_tovec_st - - docsearch = knownledge_tovec_st("./datasets/plan.md") - docs = docsearch.similarity_search_with_score(q, k=1) - - for doc in docs: - dc, s = doc - print(s) - yield dc.page_content - - -if __name__ == "__main__": - # agent_demo() - - with gr.Blocks() as demo: - gr.Markdown("数据库智能助手") - with gr.Tab("知识问答"): - text_input = gr.TextArea() - text_output = gr.TextArea() - text_button = gr.Button() - - text_button.click(get_similar, inputs=text_input, outputs=text_output) - - demo.queue(concurrency_count=3).launch(server_name="0.0.0.0") diff --git a/examples/embdserver.py b/examples/embdserver.py deleted file mode 100644 index ae0dfcae8..000000000 --- a/examples/embdserver.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- - -import json -import os -import sys -from urllib.parse import urljoin - -import gradio as gr -import requests - -ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(ROOT_PATH) - - -from langchain.prompts import PromptTemplate - -from pilot.configs.config import Config -from pilot.conversation import conv_qa_prompt_template, conv_templates - -llmstream_stream_path = "generate_stream" - -CFG = Config() - - -def generate(query): - template_name = "conv_one_shot" - state = conv_templates[template_name].copy() - - # pt = PromptTemplate( - # template=conv_qa_prompt_template, - # input_variables=["context", "question"] - # ) - - # result = pt.format(context="This page covers how to use the Chroma ecosystem within LangChain. It is broken into two parts: installation and setup, and then references to specific Chroma wrappers.", - # question=query) - - # print(result) - - state.append_message(state.roles[0], query) - state.append_message(state.roles[1], None) - - prompt = state.get_prompt() - params = { - "model": "chatglm-6b", - "prompt": prompt, - "temperature": 1.0, - "max_new_tokens": 1024, - "stop": "###", - } - - response = requests.post( - url=urljoin(CFG.MODEL_SERVER, llmstream_stream_path), data=json.dumps(params) - ) - - skip_echo_len = len(params["prompt"]) + 1 - params["prompt"].count("") * 3 - - for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"): - if chunk: - data = json.loads(chunk.decode()) - if data["error_code"] == 0: - if "vicuna" in CFG.LLM_MODEL: - output = data["text"][skip_echo_len:].strip() - else: - output = data["text"].strip() - - state.messages[-1][-1] = output + "▌" - yield (output) - - -if __name__ == "__main__": - print(CFG.LLM_MODEL) - with gr.Blocks() as demo: - gr.Markdown("数据库SQL生成助手") - with gr.Tab("SQL生成"): - text_input = gr.TextArea() - text_output = gr.TextArea() - text_button = gr.Button("提交") - - text_button.click(generate, inputs=text_input, outputs=text_output) - - demo.queue(concurrency_count=3).launch(server_name="0.0.0.0") diff --git a/examples/gpt_index.py b/examples/gpt_index.py deleted file mode 100644 index a4683af1a..000000000 --- a/examples/gpt_index.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import logging -import sys - -from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader - -logging.basicConfig(stream=sys.stdout, level=logging.INFO) -logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) - -# read the document of data dir -documents = SimpleDirectoryReader("data").load_data() -# split the document to chunk, max token size=500, convert chunk to vector - -index = GPTVectorStoreIndex(documents) - -# save index -index.save_to_disk("index.json") diff --git a/examples/gradio_test.py b/examples/gradio_test.py deleted file mode 100644 index 593c6c1f4..000000000 --- a/examples/gradio_test.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- - -import gradio as gr - - -def change_tab(): - return gr.Tabs.update(selected=1) - - -with gr.Blocks() as demo: - with gr.Tabs() as tabs: - with gr.TabItem("Train", id=0): - t = gr.Textbox() - with gr.TabItem("Inference", id=1): - i = gr.Image() - - btn = gr.Button() - btn.click(change_tab, None, tabs) - -demo.launch() diff --git a/examples/knowledge_embedding/csv_embedding_test.py b/examples/knowledge_embedding/csv_embedding_test.py deleted file mode 100644 index dcf4873b2..000000000 --- a/examples/knowledge_embedding/csv_embedding_test.py +++ /dev/null @@ -1,18 +0,0 @@ -from pilot.embedding_engine.csv_embedding import CSVEmbedding - -# path = "/Users/chenketing/Downloads/share_ireserve双写数据异常2.xlsx" -path = "xx.csv" -model_name = "your_path/all-MiniLM-L6-v2" -vector_store_path = "your_path/" - - -pdf_embedding = CSVEmbedding( - file_path=path, - model_name=model_name, - vector_store_config={ - "vector_store_name": "url", - "vector_store_path": "vector_store_path", - }, -) -pdf_embedding.source_embedding() -print("success") diff --git a/examples/knowledge_embedding/pdf_embedding_test.py b/examples/knowledge_embedding/pdf_embedding_test.py deleted file mode 100644 index ef0e1d87e..000000000 --- a/examples/knowledge_embedding/pdf_embedding_test.py +++ /dev/null @@ -1,18 +0,0 @@ -from pilot.embedding_engine.pdf_embedding import PDFEmbedding - -path = "xxx.pdf" -path = "your_path/OceanBase-数据库-V4.1.0-应用开发.pdf" -model_name = "your_path/all-MiniLM-L6-v2" -vector_store_path = "your_path/" - - -pdf_embedding = PDFEmbedding( - file_path=path, - model_name=model_name, - vector_store_config={ - "vector_store_name": "ob-pdf", - "vector_store_path": vector_store_path, - }, -) -pdf_embedding.source_embedding() -print("success") diff --git a/examples/knowledge_embedding/url_embedding_test.py b/examples/knowledge_embedding/url_embedding_test.py deleted file mode 100644 index c702fd1f7..000000000 --- a/examples/knowledge_embedding/url_embedding_test.py +++ /dev/null @@ -1,17 +0,0 @@ -from pilot.embedding_engine.url_embedding import URLEmbedding - -path = "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023" -model_name = "your_path/all-MiniLM-L6-v2" -vector_store_path = "your_path" - - -pdf_embedding = URLEmbedding( - file_path=path, - model_name=model_name, - vector_store_config={ - "vector_store_name": "url", - "vector_store_path": "vector_store_path", - }, -) -pdf_embedding.source_embedding() -print("success") diff --git a/examples/proxy_example.py b/examples/proxy_example.py deleted file mode 100644 index a3d2f3bc4..000000000 --- a/examples/proxy_example.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import dashscope -import requests -import hashlib -from http import HTTPStatus -from dashscope import Generation - - -def call_with_messages(): - messages = [ - {"role": "system", "content": "你是生活助手机器人。"}, - {"role": "user", "content": "如何做西红柿鸡蛋?"}, - ] - gen = Generation() - response = gen.call( - Generation.Models.qwen_turbo, - messages=messages, - stream=True, - top_p=0.8, - result_format="message", # set the result to be "message" format. - ) - - for response in response: - # The response status_code is HTTPStatus.OK indicate success, - # otherwise indicate request is failed, you can get error code - # and message from code and message. - if response.status_code == HTTPStatus.OK: - print(response.output) # The output text - print(response.usage) # The usage information - else: - print(response.code) # The error code. - print(response.message) # The error message. - - -def build_access_token(api_key: str, secret_key: str) -> str: - """ - Generate Access token according AK, SK - """ - - url = "https://aip.baidubce.com/oauth/2.0/token" - params = { - "grant_type": "client_credentials", - "client_id": api_key, - "client_secret": secret_key, - } - - res = requests.get(url=url, params=params) - - if res.status_code == 200: - return res.json().get("access_token") - - -def _calculate_md5(text: str) -> str: - md5 = hashlib.md5() - md5.update(text.encode("utf-8")) - encrypted = md5.hexdigest() - return encrypted - - -def baichuan_call(): - url = "https://api.baichuan-ai.com/v1/stream/chat" - - -if __name__ == "__main__": - call_with_messages() diff --git a/examples/t5_example.py b/examples/t5_example.py deleted file mode 100644 index b49e79d4e..000000000 --- a/examples/t5_example.py +++ /dev/null @@ -1,257 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import torch -from langchain.embeddings.huggingface import HuggingFaceEmbeddings -from langchain.llms.base import LLM -from llama_index import ( - GPTListIndex, - GPTVectorStoreIndex, - LangchainEmbedding, - LLMPredictor, - PromptHelper, - SimpleDirectoryReader, -) -from transformers import pipeline - - -class FlanLLM(LLM): - model_name = "google/flan-t5-large" - pipeline = pipeline( - "text2text-generation", - model=model_name, - device=0, - model_kwargs={"torch_dtype": torch.bfloat16}, - ) - - def _call(self, prompt, stop=None): - return self.pipeline(prompt, max_length=9999)[0]["generated_text"] - - def _identifying_params(self): - return {"name_of_model": self.model_name} - - def _llm_type(self): - return "custome" - - -llm_predictor = LLMPredictor(llm=FlanLLM()) -hfemb = HuggingFaceEmbeddings() -embed_model = LangchainEmbedding(hfemb) - -text1 = """ - 执行计划是对一条 SQL 查询语句在数据库中执行过程的描述。用户可以通过 EXPLAIN 命令查看优化器针对指定 SQL 生成的逻辑执行计划。 - -如果要分析某条 SQL 的性能问题,通常需要先查看 SQL 的执行计划,排查每一步 SQL 执行是否存在问题。所以读懂执行计划是 SQL 优化的先决条件,而了解执行计划的算子是理解 EXPLAIN 命令的关键。 - -OceanBase 数据库的执行计划命令有三种模式:EXPLAIN BASIC、EXPLAIN 和 EXPLAIN EXTENDED。这三种模式对执行计划展现不同粒度的细节信息: - -EXPLAIN BASIC 命令用于最基本的计划展示。 - -EXPLAIN EXTENDED 命令用于最详细的计划展示(通常在排查问题时使用这种展示模式)。 - -EXPLAIN 命令所展示的信息可以帮助普通用户了解整个计划的执行方式。 - -EXPLAIN 命令格式如下: -EXPLAIN [BASIC | EXTENDED | PARTITIONS | FORMAT = format_name] [PRETTY | PRETTY_COLOR] explainable_stmt -format_name: - { TRADITIONAL | JSON } -explainable_stmt: - { SELECT statement - | DELETE statement - | INSERT statement - | REPLACE statement - | UPDATE statement } - - -EXPLAIN 命令适用于 SELECT、DELETE、INSERT、REPLACE 和 UPDATE 语句,显示优化器所提供的有关语句执行计划的信息,包括如何处理该语句,如何联接表以及以何种顺序联接表等信息。 - -一般来说,可以使用 EXPLAIN EXTENDED 命令,将表扫描的范围段展示出来。使用 EXPLAIN OUTLINE 命令可以显示 Outline 信息。 - -FORMAT 选项可用于选择输出格式。TRADITIONAL 表示以表格格式显示输出,这也是默认设置。JSON 表示以 JSON 格式显示信息。 - -使用 EXPLAIN PARTITITIONS 也可用于检查涉及分区表的查询。如果检查针对非分区表的查询,则不会产生错误,但 PARTIONS 列的值始终为 NULL。 - -对于复杂的执行计划,可以使用 PRETTY 或者 PRETTY_COLOR 选项将计划树中的父节点和子节点使用树线或彩色树线连接起来,使得执行计划展示更方便阅读。示例如下: -obclient> CREATE TABLE p1table(c1 INT ,c2 INT) PARTITION BY HASH(c1) PARTITIONS 2; -Query OK, 0 rows affected - -obclient> CREATE TABLE p2table(c1 INT ,c2 INT) PARTITION BY HASH(c1) PARTITIONS 4; -Query OK, 0 rows affected - -obclient> EXPLAIN EXTENDED PRETTY_COLOR SELECT * FROM p1table p1 JOIN p2table p2 ON p1.c1=p2.c2\G -*************************** 1. row *************************** -Query Plan: ========================================================== -|ID|OPERATOR |NAME |EST. ROWS|COST| ----------------------------------------------------------- -|0 |PX COORDINATOR | |1 |278 | -|1 | EXCHANGE OUT DISTR |:EX10001|1 |277 | -|2 | HASH JOIN | |1 |276 | -|3 | ├PX PARTITION ITERATOR | |1 |92 | -|4 | │ TABLE SCAN |P1 |1 |92 | -|5 | └EXCHANGE IN DISTR | |1 |184 | -|6 | EXCHANGE OUT DISTR (PKEY)|:EX10000|1 |184 | -|7 | PX PARTITION ITERATOR | |1 |183 | -|8 | TABLE SCAN |P2 |1 |183 | -========================================================== - -Outputs & filters: -------------------------------------- - 0 - output([INTERNAL_FUNCTION(P1.C1, P1.C2, P2.C1, P2.C2)]), filter(nil) - 1 - output([INTERNAL_FUNCTION(P1.C1, P1.C2, P2.C1, P2.C2)]), filter(nil), dop=1 - 2 - output([P1.C1], [P2.C2], [P1.C2], [P2.C1]), filter(nil), - equal_conds([P1.C1 = P2.C2]), other_conds(nil) - 3 - output([P1.C1], [P1.C2]), filter(nil) - 4 - output([P1.C1], [P1.C2]), filter(nil), - access([P1.C1], [P1.C2]), partitions(p[0-1]) - 5 - output([P2.C2], [P2.C1]), filter(nil) - 6 - (#keys=1, [P2.C2]), output([P2.C2], [P2.C1]), filter(nil), dop=1 - 7 - output([P2.C1], [P2.C2]), filter(nil) - 8 - output([P2.C1], [P2.C2]), filter(nil), - access([P2.C1], [P2.C2]), partitions(p[0-3]) - -1 row in set - - - - -## 执行计划形状与算子信息 - -在数据库系统中,执行计划在内部通常是以树的形式来表示的,但是不同的数据库会选择不同的方式展示给用户。 - -如下示例分别为 PostgreSQL 数据库、Oracle 数据库和 OceanBase 数据库对于 TPCDS Q3 的计划展示。 - -```sql -obclient> SELECT /*TPC-DS Q3*/ * - FROM (SELECT dt.d_year, - item.i_brand_id brand_id, - item.i_brand brand, - Sum(ss_net_profit) sum_agg - FROM date_dim dt, - store_sales, - item - WHERE dt.d_date_sk = store_sales.ss_sold_date_sk - AND store_sales.ss_item_sk = item.i_item_sk - AND item.i_manufact_id = 914 - AND dt.d_moy = 11 - GROUP BY dt.d_year, - item.i_brand, - item.i_brand_id - ORDER BY dt.d_year, - sum_agg DESC, - brand_id) - WHERE ROWNUM <= 100; - -PostgreSQL 数据库执行计划展示如下: -Limit (cost=13986.86..13987.20 rows=27 width=91) - Sort (cost=13986.86..13986.93 rows=27 width=65) - Sort Key: dt.d_year, (sum(store_sales.ss_net_profit)), item.i_brand_id - HashAggregate (cost=13985.95..13986.22 rows=27 width=65) - Merge Join (cost=13884.21..13983.91 rows=204 width=65) - Merge Cond: (dt.d_date_sk = store_sales.ss_sold_date_sk) - Index Scan using date_dim_pkey on date_dim dt (cost=0.00..3494.62 rows=6080 width=8) - Filter: (d_moy = 11) - Sort (cost=12170.87..12177.27 rows=2560 width=65) - Sort Key: store_sales.ss_sold_date_sk - Nested Loop (cost=6.02..12025.94 rows=2560 width=65) - Seq Scan on item (cost=0.00..1455.00 rows=16 width=59) - Filter: (i_manufact_id = 914) - Bitmap Heap Scan on store_sales (cost=6.02..658.94 rows=174 width=14) - Recheck Cond: (ss_item_sk = item.i_item_sk) - Bitmap Index Scan on store_sales_pkey (cost=0.00..5.97 rows=174 width=0) - Index Cond: (ss_item_sk = item.i_item_sk) - - - -Oracle 数据库执行计划展示如下: -Plan hash value: 2331821367 --------------------------------------------------------------------------------------------------- -| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time | --------------------------------------------------------------------------------------------------- -| 0 | SELECT STATEMENT | | 100 | 9100 | 3688 (1)| 00:00:01 | -|* 1 | COUNT STOPKEY | | | | | | -| 2 | VIEW | | 2736 | 243K| 3688 (1)| 00:00:01 | -|* 3 | SORT ORDER BY STOPKEY | | 2736 | 256K| 3688 (1)| 00:00:01 | -| 4 | HASH GROUP BY | | 2736 | 256K| 3688 (1)| 00:00:01 | -|* 5 | HASH JOIN | | 2736 | 256K| 3686 (1)| 00:00:01 | -|* 6 | TABLE ACCESS FULL | DATE_DIM | 6087 | 79131 | 376 (1)| 00:00:01 | -| 7 | NESTED LOOPS | | 2865 | 232K| 3310 (1)| 00:00:01 | -| 8 | NESTED LOOPS | | 2865 | 232K| 3310 (1)| 00:00:01 | -|* 9 | TABLE ACCESS FULL | ITEM | 18 | 1188 | 375 (0)| 00:00:01 | -|* 10 | INDEX RANGE SCAN | SYS_C0010069 | 159 | | 2 (0)| 00:00:01 | -| 11 | TABLE ACCESS BY INDEX ROWID| STORE_SALES | 159 | 2703 | 163 (0)| 00:00:01 | --------------------------------------------------------------------------------------------------- - -OceanBase 数据库执行计划展示如下: -|ID|OPERATOR |NAME |EST. ROWS|COST | -------------------------------------------------------- -|0 |LIMIT | |100 |81141| -|1 | TOP-N SORT | |100 |81127| -|2 | HASH GROUP BY | |2924 |68551| -|3 | HASH JOIN | |2924 |65004| -|4 | SUBPLAN SCAN |VIEW1 |2953 |19070| -|5 | HASH GROUP BY | |2953 |18662| -|6 | NESTED-LOOP JOIN| |2953 |15080| -|7 | TABLE SCAN |ITEM |19 |11841| -|8 | TABLE SCAN |STORE_SALES|161 |73 | -|9 | TABLE SCAN |DT |6088 |29401| -======================================================= - -由示例可见,OceanBase 数据库的计划展示与 Oracle 数据库类似。 - -OceanBase 数据库执行计划中的各列的含义如下: -列名 含义 -ID 执行树按照前序遍历的方式得到的编号(从 0 开始)。 -OPERATOR 操作算子的名称。 -NAME 对应表操作的表名(索引名)。 -EST. ROWS 估算该操作算子的输出行数。 -COST 该操作算子的执行代价(微秒)。 - - -OceanBase 数据库 EXPLAIN 命令输出的第一部分是执行计划的树形结构展示。其中每一个操作在树中的层次通过其在 operator 中的缩进予以展示,层次最深的优先执行,层次相同的以特定算子的执行顺序为标准来执行。 - -问题: update a not exists (b…) -我一开始以为 B是驱动表,B的数据挺多的 后来看到NLAJ,是说左边的表关联右边的表 -所以这个的驱动表是不是实际是A,用A的匹配B的,这个理解有问题吗 - -回答: 没错 A 驱动 B的 - -问题: 光知道最下最右的是驱动表了 所以一开始搞得有点懵 :sweat_smile: - -回答: nlj应该原理应该都是左表(驱动表)的记录探测右表(被驱动表), 选哪张成为左表或右表就基于一些其他考量了,比如数据量, 而anti join/semi join只是对 not exist/exist的一种优化,相关的原理和资料网上可以查阅一下 - -问题: 也就是nlj 就是按照之前理解的谁先执行 谁就是驱动表 也就是执行计划中的最右的表 -而anti join/semi join,谁在not exist左面,谁就是驱动表。这么理解对吧 - -回答: nlj也是左表的表是驱动表,这个要了解下计划执行方面的基本原理,取左表的一行数据,再遍历右表,一旦满足连接条件,就可以返回数据 -anti/semi只是因为not exists/exist的语义只是返回左表数据,改成anti join是一种计划优化,连接的方式比子查询更优 -""" - -from llama_index import Document - -text_list = [text1] -documents = [Document(t) for t in text_list] - -num_output = 250 -max_input_size = 512 - -max_chunk_overlap = 20 -prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap) - -index = GPTListIndex( - documents, - embed_model=embed_model, - llm_predictor=llm_predictor, - prompt_helper=prompt_helper, -) -index.save_to_disk("index.json") - - -if __name__ == "__main__": - import logging - - logging.getLogger().setLevel(logging.CRITICAL) - for d in documents: - print(d) - - response = index.query("数据库的执行计划命令有多少?") - print(response)