From e5ffb6582cb3957c342e19aaa84946ab8526266c Mon Sep 17 00:00:00 2001
From: csunny
Date: Fri, 28 Apr 2023 23:53:29 +0800
Subject: [PATCH] a demo

---
 pilot/model/loader.py  |  1 -
 pilot/server/sqlgpt.py | 36 ++++++++++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/pilot/model/loader.py b/pilot/model/loader.py
index 7691a8092..98af18982 100644
--- a/pilot/model/loader.py
+++ b/pilot/model/loader.py
@@ -21,7 +21,6 @@ class ModerLoader:
         self.kwargs = {
             "torch_dtype": torch.float16,
             "device_map": "auto",
-            "max_memory": get_gpu_memory(),
         }
 
     def loader(self, load_8bit=False, debug=False):
diff --git a/pilot/server/sqlgpt.py b/pilot/server/sqlgpt.py
index 582775953..5bf2244cc 100644
--- a/pilot/server/sqlgpt.py
+++ b/pilot/server/sqlgpt.py
@@ -1,23 +1,43 @@
 #!/usr/bin/env python3
 #-*- coding: utf-8 -*-
 
+import json
 import torch
 from fastchat.serve.inference import generate_stream, compress_module
-BASE_MODE = "/home/magic/workspace/github/DB-GPT/models/vicuna-13b"
+
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
+device = "cuda" if torch.cuda.is_available() else "cpu"
+BASE_MODE = "/home/magic/workspace/github/DB-GPT/models/vicuna-13b"
 
-if __name__ == "__main__":
-
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+def generate(prompt):
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODE, use_fast=False)
     model = AutoModelForCausalLM.from_pretrained(
         BASE_MODE,
         low_cpu_mem_usage=True,
         torch_dtype=torch.float16,
         device_map="auto",
-    )
+    )
+    # compress_module(model, device)
+    # model.to(device)
+    print(model, tokenizer)
+
+    params = {
+        "model": "vicuna-13b",
+        "prompt": prompt,
+        "temperature": 0.7,
+        "max_new_tokens": 512,
+        "stop": "###"
+    }
+    output = generate_stream(
+        model, tokenizer, params, device, context_len=2048, stream_interval=2)
+
+    yield output
+
+if __name__ == "__main__":
+    pass
+
+
+
+
 
-    print(device)
-    #compress_module(model, device)
-    print(model, tokenizer)
\ No newline at end of file