diff --git a/asserts/exeable.png b/asserts/exeable.png new file mode 100644 index 000000000..47ee94f7d Binary files /dev/null and b/asserts/exeable.png differ diff --git a/pilot/server/chatbot.py b/pilot/agent/__init__.py similarity index 97% rename from pilot/server/chatbot.py rename to pilot/agent/__init__.py index 97206f2d5..c53f601b3 100644 --- a/pilot/server/chatbot.py +++ b/pilot/agent/__init__.py @@ -1,3 +1,2 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- - diff --git a/pilot/agent/agent.py b/pilot/agent/agent.py new file mode 100644 index 000000000..61f65c359 --- /dev/null +++ b/pilot/agent/agent.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + + +class Agent: + """Agent class for interacting with DB-GPT """ + pass \ No newline at end of file diff --git a/pilot/agent/agent_manager.py b/pilot/agent/agent_manager.py new file mode 100644 index 000000000..ef33f36da --- /dev/null +++ b/pilot/agent/agent_manager.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +from pilot.singleton import Singleton + +class AgentManager(metaclass=Singleton): + """Agent manager for managing DB-GPT agents""" + def __init__(self) -> None: + + self.agents = {} #TODO need to define + + def create_agent(self): + pass + + def message_agent(self): + pass + + def list_agents(self): + pass + + def delete_agent(self): + pass + diff --git a/pilot/chain/audio.py b/pilot/chain/audio.py new file mode 100644 index 000000000..8b197119c --- /dev/null +++ b/pilot/chain/audio.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- \ No newline at end of file diff --git a/pilot/connections/pg_conn.py b/pilot/chain/visual.py similarity index 100% rename from pilot/connections/pg_conn.py rename to pilot/chain/visual.py diff --git a/pilot/client/__init__.py b/pilot/client/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pilot/configs/config.py b/pilot/configs/config.py new file mode 100644 index 000000000..0d74e97d9 --- /dev/null +++ b/pilot/configs/config.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from auto_gpt_plugin_template import AutoGPTPluginTemplate +from pilot.singleton import Singleton + +class Config(metaclass=Singleton): + """Configuration class to store the state of bools for different scripts access""" + def __init__(self) -> None: + """Initialize the Config class""" + pass + diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py index df0318e2d..a5c27d9d2 100644 --- a/pilot/configs/model_config.py +++ b/pilot/configs/model_config.py @@ -11,6 +11,7 @@ PILOT_PATH = os.path.join(ROOT_PATH, "pilot") VECTORE_PATH = os.path.join(PILOT_PATH, "vector_store") LOGDIR = os.path.join(ROOT_PATH, "logs") DATASETS_DIR = os.path.join(PILOT_PATH, "datasets") +DATA_DIR = os.path.join(PILOT_PATH, "data") nltk.data.path = [os.path.join(PILOT_PATH, "nltk_data")] + nltk.data.path diff --git a/pilot/connections/base.py b/pilot/connections/base.py new file mode 100644 index 000000000..318ce17a2 --- /dev/null +++ b/pilot/connections/base.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +"""We need to design a base class. That other connector can Write with this""" + +class BaseConnection: + pass + diff --git a/pilot/connections/clickhouse.py b/pilot/connections/clickhouse.py new file mode 100644 index 000000000..23f2660f9 --- /dev/null +++ b/pilot/connections/clickhouse.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +class ClickHouseConnector: + """ClickHouseConnector""" + pass \ No newline at end of file diff --git a/pilot/connections/es.py b/pilot/connections/es.py new file mode 100644 index 000000000..819d85ecf --- /dev/null +++ b/pilot/connections/es.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +class ElasticSearchConnector: + """ElasticSearchConnector""" + pass \ No newline at end of file diff --git a/pilot/connections/mongo.py b/pilot/connections/mongo.py new file mode 100644 index 000000000..b66aefdb3 --- /dev/null +++ b/pilot/connections/mongo.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +class MongoConnector: + """MongoConnector is a class which connect to mongo and chat with LLM""" + pass \ No newline at end of file diff --git a/pilot/connections/mysql_conn.py b/pilot/connections/mysql.py similarity index 85% rename from pilot/connections/mysql_conn.py rename to pilot/connections/mysql.py index 2dfff2ee7..acc59a144 100644 --- a/pilot/connections/mysql_conn.py +++ b/pilot/connections/mysql.py @@ -4,7 +4,11 @@ import pymysql class MySQLOperator: - """Connect MySQL Database fetch MetaData For LLM Prompt """ + """Connect MySQL Database fetch MetaData For LLM Prompt + Args: + + Usage: + """ default_db = ["information_schema", "performance_schema", "sys", "mysql"] def __init__(self, user, password, host="localhost", port=3306) -> None: @@ -26,6 +30,9 @@ class MySQLOperator: cursor.execute(_sql) results = cursor.fetchall() return results + + def get_index(self, schema_name): + pass def get_db_list(self): with self.conn.cursor() as cursor: @@ -38,5 +45,7 @@ class MySQLOperator: dbs = [d["Database"] for d in results if d["Database"] not in self.default_db] return dbs + def get_meta(self, schema_name): + pass diff --git a/pilot/connections/oracle.py b/pilot/connections/oracle.py new file mode 100644 index 000000000..4ce4e742a --- /dev/null +++ b/pilot/connections/oracle.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +class OracleConnector: + """OracleConnector""" + pass \ No newline at end of file diff --git a/pilot/connections/postgres.py b/pilot/connections/postgres.py new file mode 100644 index 000000000..3e1df00ab --- /dev/null +++ b/pilot/connections/postgres.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + + +class PostgresConnector: + """PostgresConnector is a class which Connector to chat with LLM""" + pass \ No newline at end of file diff --git a/pilot/connections/redis.py b/pilot/connections/redis.py new file mode 100644 index 000000000..ac00ade63 --- /dev/null +++ b/pilot/connections/redis.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + + +class RedisConnector: + """RedisConnector""" + pass \ No newline at end of file diff --git a/pilot/conversation.py b/pilot/conversation.py index 2dc8df2b9..e1715e427 100644 --- a/pilot/conversation.py +++ b/pilot/conversation.py @@ -89,7 +89,7 @@ class Conversation: def gen_sqlgen_conversation(dbname): - from pilot.connections.mysql_conn import MySQLOperator + from pilot.connections.mysql import MySQLOperator mo = MySQLOperator( **DB_SETTINGS ) diff --git a/pilot/model/loader.py b/pilot/model/loader.py index e601621f7..747585fa4 100644 --- a/pilot/model/loader.py +++ b/pilot/model/loader.py @@ -10,7 +10,12 @@ from transformers import ( from fastchat.serve.compression import compress_module -class ModerLoader: +class ModelLoader: + """Model loader is a class for model load + + Args: model_path + + """ kwargs = {} diff --git a/pilot/pturning/lora/finetune.py b/pilot/pturning/lora/finetune.py new file mode 100644 index 000000000..6cd9935ed --- /dev/null +++ b/pilot/pturning/lora/finetune.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import json +import transformers +from transformers import LlamaTokenizer, LlamaForCausalLM + +from typing import List +from peft import ( + LoraConfig, + get_peft_model, + get_peft_model_state_dict, + prepare_model_for_int8_training, +) + +import torch +from datasets import load_dataset +import pandas as pd + + +from pilot.configs.model_config import DATA_DIR, LLM_MODEL, LLM_MODEL_CONFIG +device = "cuda" if torch.cuda.is_available() else "cpu" +CUTOFF_LEN = 50 + +df = pd.read_csv(os.path.join(DATA_DIR, "BTC_Tweets_Updated.csv")) + +def sentiment_score_to_name(score: float): + if score > 0: + return "Positive" + elif score < 0: + return "Negative" + return "Neutral" + + +dataset_data = [ + { + "instruction": "Detect the sentiment of the tweet.", + "input": row_dict["Tweet"], + "output": sentiment_score_to_name(row_dict["New_Sentiment_State"]) + } + for row_dict in df.to_dict(orient="records") +] + +with open(os.path.join(DATA_DIR, "alpaca-bitcoin-sentiment-dataset.json"), "w") as f: + json.dump(dataset_data, f) + + +data = load_dataset("json", data_files=os.path.join(DATA_DIR, "alpaca-bitcoin-sentiment-dataset.json")) +print(data["train"]) + +BASE_MODEL = LLM_MODEL_CONFIG[LLM_MODEL] +model = LlamaForCausalLM.from_pretrained( + BASE_MODEL, + torch_dtype=torch.float16, + device_map="auto", + offload_folder=os.path.join(DATA_DIR, "vicuna-lora") +) + +tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL) +tokenizer.pad_token_id = (0) +tokenizer.padding_side = "left" + +def generate_prompt(data_point): + return f"""Blow is an instruction that describes a task, paired with an input that provide future context. + Write a response that appropriately completes the request. #noqa: + + ### Instruct: + {data_point["instruction"]} + ### Input + {data_point["input"]} + ### Response + {data_point["output"]} + """ + +def tokenize(prompt, add_eos_token=True): + result = tokenizer( + prompt, + truncation=True, + max_length=CUTOFF_LEN, + padding=False, + return_tensors=None, + ) + + if (result["input_ids"][-1] != tokenizer.eos_token_id and len(result["input_ids"]) < CUTOFF_LEN and add_eos_token): + result["input_ids"].append(tokenizer.eos_token_id) + result["attention_mask"].append(1) + + result["labels"] = result["input_ids"].copy() + return result + +def generate_and_tokenize_prompt(data_point): + full_prompt = generate_prompt(data_point) + tokenized_full_prompt = tokenize(full_prompt) + return tokenized_full_prompt + + +train_val = data["train"].train_test_split( + test_size=200, shuffle=True, seed=42 +) + +train_data = ( + train_val["train"].map(generate_and_tokenize_prompt) +) + +val_data = ( + train_val["test"].map(generate_and_tokenize_prompt) +) + +# Training +LORA_R = 8 +LORA_ALPHA = 16 +LORA_DROPOUT = 0.05 +LORA_TARGET_MODULES = [ + "q_proj", + "v_proj", +] + +BATCH_SIZE = 128 +MICRO_BATCH_SIZE = 4 +GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE +LEARNING_RATE = 3e-4 +TRAIN_STEPS = 300 +OUTPUT_DIR = "experiments" + +# We can now prepare model for training +model = prepare_model_for_int8_training(model) +config = LoraConfig( + r = LORA_R, + lora_alpha=LORA_ALPHA, + target_modules=LORA_TARGET_MODULES, + lora_dropout=LORA_DROPOUT, + bias="none", + task_type="CAUSAL_LM", +) + +model = get_peft_model(model, config) +model.print_trainable_parameters() + +training_arguments = transformers.TrainingArguments( + per_device_train_batch_size=MICRO_BATCH_SIZE, + gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, + warmup_steps=100, + max_steps=TRAIN_STEPS, + no_cuda=True, + learning_rate=LEARNING_RATE, + logging_steps=10, + optim="adamw_torch", + evaluation_strategy="steps", + save_strategy="steps", + eval_steps=50, + save_steps=50, + output_dir=OUTPUT_DIR, + save_total_limit=3, + load_best_model_at_end=True, + report_to="tensorboard" +) + +data_collector = transformers.DataCollatorForSeq2Seq( + tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True +) + +trainer = transformers.Trainer( + model=model, + train_dataset=train_data, + eval_dataset=val_data, + args=training_arguments, + data_collector=data_collector +) + +model.config.use_cache = False +old_state_dict = model.state_dict +model.state_dict = ( + lambda self, *_, **__: get_peft_model_state_dict( + self, old_state_dict() + ) +).__get__(model, type(model)) + +trainer.train() +model.save_pretrained(OUTPUT_DIR) diff --git a/pilot/server/vicuna_server.py b/pilot/server/vicuna_server.py index 674afb71b..868e8b6d9 100644 --- a/pilot/server/vicuna_server.py +++ b/pilot/server/vicuna_server.py @@ -13,7 +13,7 @@ from pilot.model.inference import generate_output, get_embeddings from fastchat.serve.inference import load_model -from pilot.model.loader import ModerLoader +from pilot.model.loader import ModelLoader from pilot.configs.model_config import * model_path = LLM_MODEL_CONFIG[LLM_MODEL] @@ -22,7 +22,7 @@ model_path = LLM_MODEL_CONFIG[LLM_MODEL] global_counter = 0 model_semaphore = None -ml = ModerLoader(model_path=model_path) +ml = ModelLoader(model_path=model_path) model, tokenizer = ml.loader(num_gpus=1, load_8bit=ISLOAD_8BIT, debug=ISDEBUG) #model, tokenizer = load_model(model_path=model_path, device=DEVICE, num_gpus=1, load_8bit=True, debug=False) diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py index b31bfde7f..bbe710667 100644 --- a/pilot/server/webserver.py +++ b/pilot/server/webserver.py @@ -12,7 +12,7 @@ import requests from urllib.parse import urljoin from pilot.configs.model_config import DB_SETTINGS from pilot.server.vectordb_qa import KnownLedgeBaseQA -from pilot.connections.mysql_conn import MySQLOperator +from pilot.connections.mysql import MySQLOperator from pilot.vector_store.extract_tovec import get_vector_storelist, load_knownledge_from_doc, knownledge_tovec_st from pilot.configs.model_config import LOGDIR, VICUNA_MODEL_SERVER, LLM_MODEL, DATASETS_DIR diff --git a/pilot/singleton.py b/pilot/singleton.py new file mode 100644 index 000000000..8a9d6e2fa --- /dev/null +++ b/pilot/singleton.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +"""The singleton metaclass for ensuring only one instance of a class.""" +import abc +from typing import Any + +class Singleton(abc.ABCMeta, type): + """ Singleton metaclass for ensuring only one instance of a class""" + + _instances = {} + def __call__(cls, *args: Any, **kwargs: Any) -> Any: + """Call method for the singleton metaclass""" + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +class AbstractSingleton(abc.ABC, metaclass=Singleton): + """Abstract singleton class for ensuring only one instance of a class""" + pass \ No newline at end of file diff --git a/pilot/vector_store/file_loader.py b/pilot/vector_store/file_loader.py index 881c8106f..296232f21 100644 --- a/pilot/vector_store/file_loader.py +++ b/pilot/vector_store/file_loader.py @@ -15,6 +15,20 @@ from pilot.configs.model_config import VECTORE_PATH, DATASETS_DIR, LLM_MODEL_CON class KnownLedge2Vector: + """KnownLedge2Vector class is order to load document to vector + and persist to vector store. + + Args: + - model_name + + Usage: + k2v = KnownLedge2Vector() + persist_dir = os.path.join(VECTORE_PATH, ".vectordb") + print(persist_dir) + for s, dc in k2v.query("what is oceanbase?"): + print(s, dc.page_content, dc.metadata) + + """ embeddings: object = None model_name = LLM_MODEL_CONFIG["sentence-transforms"] top_k: int = VECTOR_SEARCH_TOP_K @@ -81,11 +95,4 @@ class KnownLedge2Vector: dc, s = doc yield s, dc -if __name__ == "__main__": - k2v = KnownLedge2Vector() - - persist_dir = os.path.join(VECTORE_PATH, ".vectordb") - print(persist_dir) - for s, dc in k2v.query("什么是OceanBase"): - print(s, dc.page_content, dc.metadata) \ No newline at end of file