feat: Support llama-2 model

Author: FangYin Cheng
Date: 2023-07-20 21:43:20 +08:00
Parent: 412b104797
Commit: 168c754a3f
12 changed files with 671 additions and 13 deletions


@@ -77,6 +77,8 @@ class ModelWorker:
     def generate_stream_gate(self, params):
         try:
+            # params adaptation
+            params, model_context = self.llm_chat_adapter.model_adaptation(params)
             for output in self.generate_stream_func(
                 self.model, self.tokenizer, params, DEVICE, CFG.MAX_POSITION_EMBEDDINGS
             ):
@@ -84,10 +86,8 @@ class ModelWorker:
                 # The gpt4all thread shares stdout with the parent process,
                 # and opening it may affect the frontend output.
                 print("output: ", output)
-                ret = {
-                    "text": output,
-                    "error_code": 0,
-                }
+                # return some model context to dgt-server
+                ret = {"text": output, "error_code": 0, "model_context": model_context}
                 yield json.dumps(ret).encode() + b"\0"
         except torch.cuda.CudaError:
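For context, here is a minimal sketch of what a chat adapter's model_adaptation() hook could do for llama-2. This is an illustrative assumption, not the adapter code shipped in this commit: the LlamaChatAdapter class and the prompt_echo_len_char key are hypothetical names. The shape matches the diff above, though: rewrite the incoming params into the model's prompt template and return extra context that generate_stream_gate attaches to every streamed chunk.

    # Hypothetical sketch: adapt generic chat params to the llama-2
    # "[INST] ... [/INST]" prompt format and return extra model context.
    class LlamaChatAdapter:
        SYSTEM_TEMPLATE = "<<SYS>>\n{system}\n<</SYS>>\n\n"

        def model_adaptation(self, params: dict) -> tuple[dict, dict]:
            system = params.get("system", "You are a helpful assistant.")
            prompt = "<s>[INST] " + self.SYSTEM_TEMPLATE.format(system=system)
            for msg in params.get("messages", []):
                if msg["role"] == "user":
                    prompt += msg["content"] + " [/INST] "
                else:  # assistant turn: close the previous instruction block
                    prompt += msg["content"] + " </s><s>[INST] "
            adapted = dict(params)
            adapted["prompt"] = prompt
            # Context the worker forwards with each streamed chunk, e.g. so
            # the server can strip the echoed prompt from the raw output.
            # ("prompt_echo_len_char" is an assumed key for illustration.)
            model_context = {"prompt_echo_len_char": len(prompt)}
            return adapted, model_context

    adapter = LlamaChatAdapter()
    params, ctx = adapter.model_adaptation(
        {"messages": [{"role": "user", "content": "Hello"}]}
    )
    print(params["prompt"])
    print(ctx)

Returning the context as a second value, rather than mutating params in place, is what lets the worker keep it alongside every chunk it yields, as the new "model_context" field in the diff shows.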