diff --git a/pilot/configs/model_config.py b/pilot/configs/model_config.py index 249979477..80b1dabe4 100644 --- a/pilot/configs/model_config.py +++ b/pilot/configs/model_config.py @@ -19,7 +19,7 @@ llm_model_config = { LLM_MODEL = "vicuna-13b" LIMIT_MODEL_CONCURRENCY = 5 MAX_POSITION_EMBEDDINGS = 2048 -vicuna_model_server = "http://192.168.31.114:21002" +vicuna_model_server = "http://192.168.31.114:8000" # Load model config diff --git a/pilot/server/webserver.py b/pilot/server/webserver.py index 8bd977cbb..19ee4b697 100644 --- a/pilot/server/webserver.py +++ b/pilot/server/webserver.py @@ -157,7 +157,7 @@ def http_bot(state, temperature, max_new_tokens, request: gr.Request): try: # Stream output - response = requests.post(urljoin(vicuna_model_server, "worker_generate_stream"), + response = requests.post(urljoin(vicuna_model_server, "generate_stream"), headers=headers, json=payload, stream=True, timeout=20) for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"): if chunk: