From e5ffb6582cb3957c342e19aaa84946ab8526266c Mon Sep 17 00:00:00 2001
From: csunny
Date: Fri, 28 Apr 2023 23:53:29 +0800
Subject: [PATCH] a demo

---
 pilot/model/loader.py  |  1 -
 pilot/server/sqlgpt.py | 36 ++++++++++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/pilot/model/loader.py b/pilot/model/loader.py
index 7691a8092..98af18982 100644
--- a/pilot/model/loader.py
+++ b/pilot/model/loader.py
@@ -21,7 +21,6 @@ class ModerLoader:
         self.kwargs = {
             "torch_dtype": torch.float16,
             "device_map": "auto",
-            "max_memory": get_gpu_memory(),
         }
 
     def loader(self, load_8bit=False, debug=False):
diff --git a/pilot/server/sqlgpt.py b/pilot/server/sqlgpt.py
index 582775953..5bf2244cc 100644
--- a/pilot/server/sqlgpt.py
+++ b/pilot/server/sqlgpt.py
@@ -1,23 +1,43 @@
 #!/usr/bin/env python3
 #-*- coding: utf-8 -*-
 
+import json
 import torch
 from fastchat.serve.inference import generate_stream, compress_module
-BASE_MODE = "/home/magic/workspace/github/DB-GPT/models/vicuna-13b"
+
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
+device = "cuda" if torch.cuda.is_available() else "cpu"
+BASE_MODE = "/home/magic/workspace/github/DB-GPT/models/vicuna-13b"
 
-if __name__ == "__main__":
-
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+def generate(prompt):
     tokenizer = AutoTokenizer.from_pretrained(BASE_MODE, use_fast=False)
     model = AutoModelForCausalLM.from_pretrained(
         BASE_MODE,
         low_cpu_mem_usage=True,
         torch_dtype=torch.float16,
         device_map="auto",
-    )
+    )
+    # compress_module(model, device)
+    # model.to(device)
+    print(model, tokenizer)
+
+    params = {
+        "model": "vicuna-13b",
+        "prompt": prompt,
+        "temperature": 0.7,
+        "max_new_tokens": 512,
+        "stop": "###"
+    }
+    output = generate_stream(
+        model, tokenizer, params, device, context_len=2048, stream_interval=2)
+
+    yield output
+
+if __name__ == "__main__":
+    pass
+
+
+
+
 
-    print(device)
-    #compress_module(model, device)
-    print(model, tokenizer)
\ No newline at end of file