mirror of https://github.com/csunny/DB-GPT.git (synced 2025-07-27 13:57:46 +00:00)

Commit e5ffb6582c (parent 0a30991456): a demo
@@ -21,7 +21,6 @@ class ModerLoader:
         self.kwargs = {
             "torch_dtype": torch.float16,
             "device_map": "auto",
-            "max_memory": get_gpu_memory(),
         }
 
     def loader(self, load_8bit=False, debug=False):
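Note: the hunk above drops "max_memory": get_gpu_memory() from the loader kwargs, leaving memory placement to the device_map="auto" default. As a rough sketch only (ModerLoader.loader()'s body is not shown in this diff), kwargs like these are typically forwarded to the Hugging Face from_pretrained call:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path taken from this commit's demo script; adjust to the local checkout.
model_path = "/home/magic/workspace/github/DB-GPT/models/vicuna-13b"

kwargs = {
    "torch_dtype": torch.float16,  # half precision so the 13B weights fit in GPU memory
    "device_map": "auto",          # let accelerate spread layers across available devices
}

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs)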
@@ -1,15 +1,16 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
+import json
 import torch
 from fastchat.serve.inference import generate_stream, compress_module
 
-BASE_MODE = "/home/magic/workspace/github/DB-GPT/models/vicuna-13b"
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-if __name__ == "__main__":
-
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+BASE_MODE = "/home/magic/workspace/github/DB-GPT/models/vicuna-13b"
+
+def generate(prompt):
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODE, use_fast=False)
     model = AutoModelForCausalLM.from_pretrained(
         BASE_MODE,
@@ -17,7 +18,26 @@ if __name__ == "__main__":
         torch_dtype=torch.float16,
         device_map="auto",
     )
 
-    print(device)
     # compress_module(model, device)
+    # model.to(device)
     print(model, tokenizer)
 
+    params = {
+        "model": "vicuna-13b",
+        "prompt": prompt,
+        "temperature": 0.7,
+        "max_new_tokens": 512,
+        "stop": "###"
+    }
+    output = generate_stream(
+        model, tokenizer, params, device, context_len=2048, stream_interval=2)
+
+    yield output
+
+if __name__ == "__main__":
+    pass
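The new generate() is itself a generator: it yields the generate_stream generator once rather than a finished string, so a caller unwraps one level and then iterates the stream. A minimal usage sketch, assuming the demo module is imported directly and noting that the item type yielded by generate_stream (plain string vs. dict) depends on the fastchat version:

if __name__ == "__main__":
    # Hypothetical prompt; the demo's params use "###" as the stop marker.
    prompt = "Human: What is DB-GPT? ### Assistant:"
    for stream in generate(prompt):
        for chunk in stream:
            # Some fastchat versions yield plain strings, others dicts with a "text" key.
            text = chunk["text"] if isinstance(chunk, dict) else chunk
            print(text)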