refactor: The first refactored version for SDK release (#907)
Co-authored-by: chengfangyin2 <chengfangyin3@jd.com>
0
dbgpt/util/benchmarks/__init__.py
Normal file
0
dbgpt/util/benchmarks/llm/__init__.py
Normal file
296
dbgpt/util/benchmarks/llm/fastchat_benchmarks_inference.py
Normal file
@@ -0,0 +1,296 @@
"""
Adapted from fastchat: https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/inference.py.
For benchmarks.

"""
import gc
from typing import Iterable, Dict

import torch
from transformers.generation.logits_process import (
    LogitsProcessorList,
    RepetitionPenaltyLogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
)


from fastchat.utils import is_partial_stop, is_sentence_complete, get_context_length


def prepare_logits_processor(
    temperature: float, repetition_penalty: float, top_p: float, top_k: int
) -> LogitsProcessorList:
    processor_list = LogitsProcessorList()
    # TemperatureLogitsWarper doesn't accept 0.0, and 1.0 makes it a no-op, so we skip both cases.
    if temperature >= 1e-5 and temperature != 1.0:
        processor_list.append(TemperatureLogitsWarper(temperature))
    if repetition_penalty > 1.0:
        processor_list.append(RepetitionPenaltyLogitsProcessor(repetition_penalty))
    if 1e-8 <= top_p < 1.0:
        processor_list.append(TopPLogitsWarper(top_p))
    if top_k > 0:
        processor_list.append(TopKLogitsWarper(top_k))
    return processor_list


@torch.inference_mode()
def generate_stream(
    model,
    tokenizer,
    params: Dict,
    device: str,
    context_len: int,
    stream_interval: int = 1,
    judge_sent_end: bool = False,
):
    if hasattr(model, "device"):
        device = model.device

    # Read parameters
    prompt = params["prompt"]
    len_prompt = len(prompt)
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    top_k = int(params.get("top_k", -1))  # -1 means disable
    max_new_tokens = int(params.get("max_new_tokens", 256))
    logprobs = params.get("logprobs", None)  # FIXME: Support logprobs>1.
    echo = bool(params.get("echo", True))
    stop_str = params.get("stop", None)
    stop_token_ids = params.get("stop_token_ids", None) or []
    if tokenizer.eos_token_id not in stop_token_ids:
        stop_token_ids.append(tokenizer.eos_token_id)

    logits_processor = prepare_logits_processor(
        temperature, repetition_penalty, top_p, top_k
    )
    input_ids = tokenizer(prompt).input_ids

    if model.config.is_encoder_decoder:
        max_src_len = context_len
    else:  # truncate
        max_src_len = context_len - max_new_tokens - 1

    input_ids = input_ids[-max_src_len:]
    output_ids = list(input_ids)
    input_echo_len = len(input_ids)

    # Don't stop generating until max_new_tokens is reached.
    stop_token_ids = []
    stop_str = None

    if model.config.is_encoder_decoder:
        if logprobs is not None:  # FIXME: Support logprobs for encoder-decoder models.
            raise NotImplementedError
        encoder_output = model.encoder(
            input_ids=torch.as_tensor([input_ids], device=device)
        )[0]
        start_ids = torch.as_tensor(
            [[model.generation_config.decoder_start_token_id]],
            dtype=torch.int64,
            device=device,
        )
    else:
        start_ids = torch.as_tensor([input_ids], device=device)

    past_key_values = out = None
    token_logprobs = [None]  # The first token has no logprobs.
    sent_interrupt = False
    finish_reason = None
    for i in range(max_new_tokens):
        if i == 0:  # prefill
            if model.config.is_encoder_decoder:
                out = model.decoder(
                    input_ids=start_ids,
                    encoder_hidden_states=encoder_output,
                    use_cache=True,
                )
                logits = model.lm_head(out[0])
            else:
                out = model(input_ids=start_ids, use_cache=True)
                logits = out.logits
            past_key_values = out.past_key_values

            if logprobs is not None:
                # Prefill logprobs for the prompt.
                shift_input_ids = start_ids[..., 1:].contiguous()
                shift_logits = logits[..., :-1, :].contiguous()
                shift_logits = torch.log_softmax(shift_logits, dim=-1).tolist()
                for label_id, logit in zip(
                    shift_input_ids[0].tolist(), shift_logits[0]
                ):
                    token_logprobs.append(logit[label_id])
        else:  # decoding
            if model.config.is_encoder_decoder:
                out = model.decoder(
                    input_ids=torch.as_tensor(
                        [[token] if not sent_interrupt else output_ids],
                        device=device,
                    ),
                    encoder_hidden_states=encoder_output,
                    use_cache=True,
                    past_key_values=past_key_values if not sent_interrupt else None,
                )
                sent_interrupt = False

                logits = model.lm_head(out[0])
            else:
                out = model(
                    input_ids=torch.as_tensor(
                        [[token] if not sent_interrupt else output_ids],
                        device=device,
                    ),
                    use_cache=True,
                    past_key_values=past_key_values if not sent_interrupt else None,
                )
                sent_interrupt = False
                logits = out.logits
            past_key_values = out.past_key_values

        if logits_processor:
            if repetition_penalty > 1.0:
                tmp_output_ids = torch.as_tensor([output_ids], device=logits.device)
            else:
                tmp_output_ids = None
            last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
        else:
            last_token_logits = logits[0, -1, :]

        if device == "mps":
            # Switch to CPU to avoid some bugs in the mps backend.
            last_token_logits = last_token_logits.float().to("cpu")

        if temperature < 1e-5 or top_p < 1e-8:  # greedy
            _, indices = torch.topk(last_token_logits, 2)
            tokens = [int(index) for index in indices.tolist()]
        else:
            probs = torch.softmax(last_token_logits, dim=-1)
            indices = torch.multinomial(probs, num_samples=2)
            tokens = [int(token) for token in indices.tolist()]
        token = tokens[0]
        output_ids.append(token)
        if logprobs is not None:
            # Cannot use last_token_logits because logprobs is based on raw logits.
            token_logprobs.append(
                torch.log_softmax(logits[0, -1, :], dim=-1)[token].tolist()
            )

        if token in stop_token_ids:
            stopped = True
        else:
            stopped = False

        # Yield the output tokens
        if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
            if echo:
                tmp_output_ids = output_ids
                rfind_start = len_prompt
            else:
                tmp_output_ids = output_ids[input_echo_len:]
                rfind_start = 0

            output = tokenizer.decode(
                tmp_output_ids,
                skip_special_tokens=True,
                spaces_between_special_tokens=False,
                clean_up_tokenization_spaces=True,
            )
            ret_logprobs = None
            if logprobs is not None:
                ret_logprobs = {
                    "text_offset": [],
                    "tokens": [
                        tokenizer.decode(token)
                        for token in (
                            output_ids if echo else output_ids[input_echo_len:]
                        )
                    ],
                    "token_logprobs": token_logprobs
                    if echo
                    else token_logprobs[input_echo_len:],
                    "top_logprobs": [{}]
                    * len(token_logprobs if echo else token_logprobs[input_echo_len:]),
                }
                # Compute text_offset
                curr_pos = 0
                for text in ret_logprobs["tokens"]:
                    ret_logprobs["text_offset"].append(curr_pos)
                    curr_pos += len(text)

            # TODO: Incomplete sentences can interrupt the output; this is a workaround
            # that could be replaced with something more elegant.
            if judge_sent_end and stopped and not is_sentence_complete(output):
                if len(tokens) > 1:
                    token = tokens[1]
                    output_ids[-1] = token
                else:
                    output_ids.pop()
                stopped = False
                sent_interrupt = True

            partially_stopped = False
            if stop_str:
                if isinstance(stop_str, str):
                    pos = output.rfind(stop_str, rfind_start)
                    if pos != -1:
                        output = output[:pos]
                        stopped = True
                    else:
                        partially_stopped = is_partial_stop(output, stop_str)
                elif isinstance(stop_str, Iterable):
                    for each_stop in stop_str:
                        pos = output.rfind(each_stop, rfind_start)
                        if pos != -1:
                            output = output[:pos]
                            stopped = True
                            break
                        else:
                            partially_stopped = is_partial_stop(output, each_stop)
                            if partially_stopped:
                                break
                else:
                    raise ValueError("Invalid stop field type.")

            # Prevent yielding a partial stop sequence
            if not partially_stopped:
                yield {
                    "text": output,
                    "logprobs": ret_logprobs,
                    "usage": {
                        "prompt_tokens": input_echo_len,
                        "completion_tokens": i,
                        "total_tokens": input_echo_len + i,
                    },
                    "finish_reason": None,
                }

        if stopped:
            break

    # Finish stream event, which contains the finish reason
    else:
        finish_reason = "length"

    if stopped:
        finish_reason = "stop"

    yield {
        "text": output,
        "logprobs": ret_logprobs,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": i,
            "total_tokens": input_echo_len + i,
        },
        "finish_reason": finish_reason,
    }

    # Clean up
    del past_key_values, out
    gc.collect()
    torch.cuda.empty_cache()
    if device == "xpu":
        torch.xpu.empty_cache()
    if device == "npu":
        torch.npu.empty_cache()
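
A quick orientation for the streaming function above: a minimal consumption sketch, not part of the commit. It assumes fastchat and transformers are installed, a CUDA-capable host, and the illustrative checkpoint, device, and context length below.

# Minimal sketch; checkpoint name, device, and context_len are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

from dbgpt.util.benchmarks.llm.fastchat_benchmarks_inference import generate_stream

model_path = "lmsys/vicuna-7b-v1.5"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

params = {
    "prompt": "Hello, who are you?",
    "temperature": 0.7,
    "max_new_tokens": 64,
    "echo": False,
}
# generate_stream yields dicts with the accumulated "text", optional "logprobs",
# running "usage" counters, and a "finish_reason" on the final chunk.
last = None
for chunk in generate_stream(model, tokenizer, params, device="cuda", context_len=4096):
    last = chunk
print(last["text"], last["usage"], last["finish_reason"])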
282
dbgpt/util/benchmarks/llm/llm_benchmarks.py
Normal file
@@ -0,0 +1,282 @@
from typing import Dict, List
import asyncio
import os
import sys
import time
import csv
import argparse
import logging
import traceback
from dbgpt.configs.model_config import ROOT_PATH, LLM_MODEL_CONFIG
from datetime import datetime

from dbgpt.model.cluster.worker.manager import (
    run_worker_manager,
    initialize_worker_manager_in_client,
    WorkerManager,
)

from dbgpt.core import ModelOutput, ModelInferenceMetrics
from dbgpt.core.interface.message import ModelMessage, ModelMessageRoleType


model_name = "vicuna-7b-v1.5"
model_path = LLM_MODEL_CONFIG[model_name]
# or "vllm"
model_type = "huggingface"

controller_addr = "http://127.0.0.1:5000"

result_csv_file = None

parallel_nums = [1, 2, 4, 16, 32]
# parallel_nums = [1, 2, 4]


def get_result_csv_file() -> str:
    return os.path.join(
        ROOT_PATH, f"pilot/data/{model_name}_{model_type}_benchmarks_llm.csv"
    )


input_lens = [64, 64]
output_lens = [256, 512]


prompt_file_map = {
    "11k": os.path.join(
        ROOT_PATH, "docker/examples/benchmarks/benchmarks_llm_11k_prompt.txt"
    )
}

METRICS_HEADERS = [
    # Params
    "model_name",
    "gpu_nums",
    "parallel_nums",
    "input_length",
    "output_length",
    # Merged parallel results
    "test_time_cost_ms",
    "test_total_tokens",
    # avg_test_speed_per_second (tokens/s): test_total_tokens / (test_time_cost_ms / 1000.0)
    "avg_test_speed_per_second(tokens/s)",
    # avg_first_token_latency_ms: sum(first_token_time_ms - start_time_ms) / parallel_nums
    "avg_first_token_latency_ms",
    # avg_latency_ms: sum(end_time_ms - start_time_ms) / parallel_nums
    "avg_latency_ms",
    "gpu_mem(GiB)",
    # Details for each task
    "start_time_ms",
    "end_time_ms",
    "current_time_ms",
    "first_token_time_ms",
    "first_completion_time_ms",
    "first_completion_tokens",
    "prompt_tokens",
    "completion_tokens",
    "total_tokens",
    "speed_per_second",
]


def read_prompt_from_file(file_key: str) -> str:
    full_path = prompt_file_map[file_key]
    with open(full_path, "r+", encoding="utf-8") as f:
        return f.read()


def build_param(
    input_len: int,
    output_len: int,
    user_input: str,
    system_prompt: str = None,
) -> Dict:
    hist = []
    if system_prompt is not None:
        hist.append(
            ModelMessage(role=ModelMessageRoleType.SYSTEM, content=system_prompt)
        )
    hist.append(ModelMessage(role=ModelMessageRoleType.HUMAN, content=user_input))
    hist = list(h.dict() for h in hist)
    context_len = input_len + output_len + 2
    params = {
        "prompt": user_input,
        "messages": hist,
        "model": model_name,
        "echo": False,
        "max_new_tokens": output_len,
        "context_len": context_len,
    }
    return params


async def run_batch(
    wh: WorkerManager,
    input_len: int,
    output_len: int,
    parallel_num: int,
    output_file: str,
):
    tasks = []
    prompt = read_prompt_from_file("11k")
    if model_type == "vllm":
        max_input_str_len = input_len
        if "baichuan" in model_name:
            # TODO: handle the prompt first
            max_input_str_len *= 2
        prompt = prompt[-max_input_str_len:]

    # Warm up first
    params = build_param(input_len, output_len, prompt, system_prompt="")
    await wh.generate(params)

    for _ in range(parallel_num):
        params = build_param(input_len, output_len, prompt, system_prompt="")
        tasks.append(wh.generate(params))
    print(
        f"Begin run benchmarks, model name: {model_name}, input_len: {input_len}, output_len: {output_len}, parallel_num: {parallel_num}, save result to {output_file}"
    )
    start_time_ms = time.time_ns() // 1_000_000
    results: List[ModelOutput] = await asyncio.gather(*tasks)
    end_time_ms = time.time_ns() // 1_000_000

    test_time_cost_ms = end_time_ms - start_time_ms
    test_total_tokens = 0
    first_token_latency_ms = 0
    latency_ms = 0
    gpu_nums = 0
    avg_gpu_mem = 0
    rows = []
    for r in results:
        metrics = r.metrics
        if isinstance(metrics, dict):
            metrics = ModelInferenceMetrics(**metrics)
        print(r)
        test_total_tokens += metrics.total_tokens
        first_token_latency_ms += metrics.first_token_time_ms - metrics.start_time_ms
        latency_ms += metrics.end_time_ms - metrics.start_time_ms
        row_data = metrics.to_dict()
        del row_data["collect_index"]
        if "avg_gpu_infos" in row_data:
            avg_gpu_infos = row_data["avg_gpu_infos"]
            gpu_nums = len(avg_gpu_infos)
            avg_gpu_mem = (
                sum(i["allocated_memory_gb"] for i in avg_gpu_infos) / gpu_nums
            )
            del row_data["avg_gpu_infos"]
            del row_data["current_gpu_infos"]
        rows.append(row_data)
    avg_test_speed_per_second = test_total_tokens / (test_time_cost_ms / 1000.0)
    avg_first_token_latency_ms = first_token_latency_ms / len(results)
    avg_latency_ms = latency_ms / len(results)

    with open(output_file, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=METRICS_HEADERS)
        if f.tell() == 0:
            # First write: emit the header row
            writer.writeheader()
        for row in rows:
            row["model_name"] = model_name
            row["parallel_nums"] = parallel_num
            row["input_length"] = input_len
            row["output_length"] = output_len
            row["test_time_cost_ms"] = test_time_cost_ms
            row["test_total_tokens"] = test_total_tokens
            row["avg_test_speed_per_second(tokens/s)"] = avg_test_speed_per_second
            row["avg_first_token_latency_ms"] = avg_first_token_latency_ms
            row["avg_latency_ms"] = avg_latency_ms
            row["gpu_nums"] = gpu_nums
            row["gpu_mem(GiB)"] = avg_gpu_mem
            writer.writerow(row)
    print(
        f"input_len: {input_len}, output_len: {output_len}, parallel_num: {parallel_num}, save result to {output_file}"
    )


async def run_model(wh: WorkerManager) -> None:
    global result_csv_file
    if not result_csv_file:
        result_csv_file = get_result_csv_file()
    if os.path.exists(result_csv_file):
        now = datetime.now()
        now_str = now.strftime("%Y-%m-%d")
        os.rename(result_csv_file, f"{result_csv_file}.bak_{now_str}.csv")
    for parallel_num in parallel_nums:
        for input_len, output_len in zip(input_lens, output_lens):
            try:
                await run_batch(
                    wh, input_len, output_len, parallel_num, result_csv_file
                )
            except Exception:
                msg = traceback.format_exc()
                logging.error(
                    f"Run benchmarks error, input_len: {input_len}, output_len: {output_len}, parallel_num: {parallel_num}, error message: {msg}"
                )
                if "torch.cuda.OutOfMemoryError" in msg:
                    return

    sys.exit(0)


def startup_llm_env():
    from fastapi import FastAPI

    app = FastAPI()
    initialize_worker_manager_in_client(
        app=app,
        model_name=model_name,
        model_path=model_path,
        run_locally=False,
        controller_addr=controller_addr,
        local_port=6000,
        start_listener=run_model,
    )


def connect_to_remote_model():
    startup_llm_env()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default=model_name)
    parser.add_argument("--model_path", type=str, default=None)
    parser.add_argument("--model_type", type=str, default="huggingface")
    parser.add_argument("--result_csv_file", type=str, default=None)
    parser.add_argument("--input_lens", type=str, default="8,8,256,1024")
    parser.add_argument("--output_lens", type=str, default="256,512,1024,1024")
    parser.add_argument("--parallel_nums", type=str, default="1,2,4,16,32")
    parser.add_argument(
        "--remote_model", type=bool, default=False, help="Connect to remote model"
    )
    parser.add_argument("--controller_addr", type=str, default="http://127.0.0.1:8000")
    parser.add_argument("--limit_model_concurrency", type=int, default=200)

    args = parser.parse_args()
    print(f"args: {args}")
    model_name = args.model_name
    model_path = args.model_path or LLM_MODEL_CONFIG[model_name]
    result_csv_file = args.result_csv_file
    input_lens = [int(i) for i in args.input_lens.strip().split(",")]
    output_lens = [int(i) for i in args.output_lens.strip().split(",")]
    parallel_nums = [int(i) for i in args.parallel_nums.strip().split(",")]
    remote_model = args.remote_model
    controller_addr = args.controller_addr
    limit_model_concurrency = args.limit_model_concurrency
    model_type = args.model_type
    if len(input_lens) != len(output_lens):
        raise ValueError("input_lens size must equal output_lens size")

    if remote_model:
        # Connect to the remote model and run benchmarks
        connect_to_remote_model()
    else:
        # Start the worker manager and run benchmarks
        run_worker_manager(
            model_name=model_name,
            model_path=model_path,
            start_listener=run_model,
            limit_model_concurrency=limit_model_concurrency,
            model_type=model_type,
        )
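
For context on what each benchmark task sends to the worker manager, a small usage sketch of build_param follows. It is not part of the commit: the prompt text and lengths are illustrative, and it assumes the dbgpt package is importable with "vicuna-7b-v1.5" present in LLM_MODEL_CONFIG.

# Illustrative only: shows the request dict that run_batch submits per task.
from dbgpt.util.benchmarks.llm.llm_benchmarks import build_param

params = build_param(64, 256, "Explain what DB-GPT does.", system_prompt="")
print(params["model"])           # "vicuna-7b-v1.5" (module default)
print(params["max_new_tokens"])  # 256
print(params["context_len"])     # 64 + 256 + 2 = 322
# run_batch then awaits wh.generate(params) once per parallel task.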