[inference] Refactor inference architecture (#5057)

* [inference] support only TP (#4998)

* support only tp

* enable tp

* add support for bloom (#5008)

* [refactor] refactor gptq and smoothquant llama (#5012)

* refactor gptq and smoothquant llama

* fix import error

* fix linear import torch-int

* fix smoothquant llama import error

* fix import accelerate error

* fix bug

* fix import smooth cuda

* fix smoothcuda

* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)

merge chatglm with pp and tp

* [Refactor] remove useless inference code (#5022)

* remove useless code

* fix quant model

* fix test import bug

* mv original inference legacy

* fix chatglm2

* [Refactor] refactor policy search and quant type controlling in inference (#5035)

* [Refactor] refactor policy search and quant type controlling in inference

* [inference] update readme (#5051)

* update readme

* update readme

* fix architecture

* fix table

* fix table

* [inference] update example (#5053)

* update example

* fix run.sh

* fix rebase bug

* fix some errors

* update readme

* add some features

* update interface

* update readme

* update benchmark

* add requirements-infer

---------

Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Authored by Xu Kai on 2023-11-19 21:05:05 +08:00, committed by GitHub
parent bc09b95f50, commit fd6482ad8c
115 changed files with 6027 additions and 1431 deletions
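
The diffs below remove the legacy `TPInferEngine`-only benchmarks and examples and add hybrid tensor-parallel + pipeline-parallel examples built on `CaiInferEngine`. As a rough, minimal sketch of that interface (argument names are taken from the example scripts added in this commit; the toy Llama config and the 2-process launch are illustrative assumptions, not part of the diff):

```python
import torch
import transformers

import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.testing import spawn


def run(rank, world_size, port):
    # set up the Colossal-AI distributed environment for this process
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")

    # small randomly initialized Llama so the sketch runs without downloading weights
    model = transformers.LlamaForCausalLM(transformers.LlamaConfig(num_hidden_layers=4))

    engine = CaiInferEngine(
        tp_size=1,           # tensor parallel degree
        pp_size=2,           # pipeline parallel degree
        model=model,
        dtype="fp16",
        micro_batch_size=1,
        max_batch_size=4,
        max_input_len=32,
        max_output_len=32,
    )

    inputs = {
        "input_ids": torch.randint(1, 1000, (4, 32), device="cuda"),
        "attention_mask": torch.ones((4, 32), dtype=torch.int64, device="cuda"),
    }
    output = engine.generate(inputs)
    if rank == 0:
        print(f"generated {len(output)} sequences")


if __name__ == "__main__":
    # one process per (tp_size x pp_size) rank
    spawn(run, nprocs=1 * 2)
```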


@@ -1,84 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from transformers import BloomForCausalLM, BloomTokenizerFast
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def bench_bloom(args):
model_path = args.path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
tokenizer = BloomTokenizerFast.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
model = BloomForCausalLM.from_pretrained(model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
# init TPInferEngine and shard the original model
# To benchmark torch original, comment out the line of optimizing model
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
# prepare data for generation
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
input_tokens = {
"input_ids": torch.randint(10, 1000, (max_batch_size, max_input_len)),
"attention_mask": torch.ones((max_batch_size, max_input_len)),
}
for t in input_tokens:
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
print(f" input_tokens[{t}].shape: {input_tokens[t].shape}")
iters = 10
times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
times.append((end - start) / (out_len - max_input_len))
print_perf_stats(times, model.config, max_batch_size)
def check_bloom(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
bench_bloom(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_bloom(args):
spawn(check_bloom, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
args = parser.parse_args()
test_bloom(args)


@@ -1,118 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from transformers import AutoTokenizer
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration
from colossalai.testing import rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def run_chatglm2_test(args):
chatglm2_model_path = args.path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
args.test_mode
print("max_batch_size : " + str(max_batch_size))
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = ChatGLMForConditionalGeneration.from_pretrained(chatglm2_model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
model.config
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
generate_kwargs = dict(max_new_tokens=1, do_sample=False)
input_tokens = {
"input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
"attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
}
iters = 10
prefill_times = []
warmup = 3
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print("generation time {} s".format(str(end - start)))
print(out_len - max_input_len)
prefill_times.append((end - start) / (out_len - max_input_len))
prefill_times = prefill_times[warmup:]
prefill_time_avg = sum(prefill_times) / len(prefill_times)
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
times = []
decoder_times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print("generation time {} s".format(str(end - start)))
print(out_len - max_input_len)
times.append((end - start) / (out_len - max_input_len))
if args.test_mode == "decoder_test":
decoder_times.append((end - start - prefill_time_avg) / (out_len - max_input_len - 1))
times = times[warmup:]
latency = sum(times) / len(times)
print("total process latency is : " + str(latency) + " s")
print("total throughput is : " + str(1 / latency * max_batch_size))
if args.test_mode == "decoder_test":
decoder_times = decoder_times[warmup:]
latency = sum(decoder_times) / len(decoder_times)
print("decoder process latency is : " + str(latency) + " s")
print("decoder throughput is : " + str(1 / latency * max_batch_size))
print_perf_stats(times, model.config, max_batch_size)
def check_chatglm2(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_chatglm2_test(args)
@rerun_if_address_is_in_use()
def test_chatglm2(args):
spawn(check_chatglm2, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=256, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
parser.add_argument(
"--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"]
)
args = parser.parse_args()
test_chatglm2(args)


@@ -1,118 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from transformers import LlamaForCausalLM, LlamaTokenizer
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def run_llama_test(args):
llama_model_path = args.path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
args.test_mode
print("max_batch_size : " + str(max_batch_size))
tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
tokenizer.pad_token_id = tokenizer.unk_token_id
model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
generate_kwargs = dict(max_new_tokens=1, do_sample=False)
input_tokens = {
"input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
"attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
}
iters = 10
prefill_times = []
warmup = 3
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print("generation time {} s".format(str(end - start)))
print(out_len - max_input_len)
prefill_times.append((end - start) / (out_len - max_input_len))
prefill_times = prefill_times[warmup:]
prefill_time_avg = sum(prefill_times) / len(prefill_times)
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
times = []
decoder_times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print("generation time {} s".format(str(end - start)))
print(out_len - max_input_len)
times.append((end - start) / (out_len - max_input_len))
if args.test_mode == "decoder_test":
decoder_times.append((end - start - prefill_time_avg) / (out_len - max_input_len - 1))
times = times[warmup:]
latency = sum(times) / len(times)
print("total process latency is : " + str(latency) + " s")
print("total throughput is : " + str(1 / latency * max_batch_size))
if args.test_mode == "decoder_test":
decoder_times = decoder_times[warmup:]
latency = sum(decoder_times) / len(decoder_times)
print("decoder process latency is : " + str(latency) + " s")
print("decoder throughput is : " + str(1 / latency * max_batch_size))
print_perf_stats(times, model.config, max_batch_size)
def check_llama(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_llama_test(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama(args):
spawn(check_llama, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=32, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
parser.add_argument(
"--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"]
)
args = parser.parse_args()
test_llama(args)


@@ -0,0 +1,151 @@
import argparse
import os
import time
import torch
import torch.distributed as dist
import transformers
import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
GIGABYTE = 1024**3
MEGABYTE = 1024 * 1024
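# Generate one random sequence and repeat it along the batch dimension to form the input batch.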
def data_gen(batch_size: int = 4, seq_len: int = 512):
input_ids = torch.randint(10, 30000, (1, seq_len), dtype=torch.int32)
attention_mask = torch.ones((1, seq_len), dtype=torch.int32)
data = dict(input_ids=input_ids, attention_mask=attention_mask)
for k, v in data.items():
if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
new_shape = [1] * v.dim()
new_shape[0] = batch_size
data[k] = v.to("cuda").repeat(*new_shape)
return data
def print_details_info(timestamps, model_config, args, whole_end2end):
log_file_name = f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.output_len}_bsz{args.batch_size}_mbsz{args.mb_size}.log"
os.makedirs(os.path.dirname(log_file_name), exist_ok=True)
if dist.get_rank() == 0:
prefill = []
encoder = []
end2end = []
for timestamp in timestamps:
prefill.append(timestamp[1] - timestamp[0])
encoder.append(
sum(timestamp[i + 1] - timestamp[i] for i in range(1, len(timestamp) - 1)) / (len(timestamp) - 2)
)
end2end.append(timestamp[-1] - timestamp[0])
print(whole_end2end)
with open(
log_file_name,
"w+",
) as f:
mb_avg_end2end = sum(end2end) / len(end2end)
mb_avg_latency = mb_avg_end2end / (args.output_len * args.mb_size)
whole_avg_latency = whole_end2end / (args.output_len * args.batch_size)
num_layers = getattr(model_config, "num_layers", model_config.num_hidden_layers)
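# rough parameter-count estimate: ~12 * hidden_size^2 per transformer layer, split across pipeline stages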
num_parameters = num_layers * model_config.hidden_size * model_config.hidden_size * 12 / args.pp_size
if args.dtype in ["fp16", "bf16"]:
num_bytes = 2
else:
num_bytes = 4
f.write(
f"llama-{args.model}{args.dtype}_pp{args.pp_size}, input_len:{args.seq_len}, output_len:{args.output_len}, bsz:{args.batch_size}, mbsz:{args.mb_size}\n"
)
f.write("Average prefill time: {0:8.2f} ms\n".format(sum(prefill) / len(prefill) * 1000))
f.write("Average encode time: {0:8.2f} ms\n".format(sum(encoder) / len(encoder) * 1000))
f.write("Average micro batch end2end time: {0:8.2f} ms\n".format(mb_avg_end2end * 1000))
f.write("Average micro batch Per Token Latency: {0:8.2f} ms\n".format(mb_avg_latency * 1000))
f.write("Whole batch end2end time: {0:8.2f} ms\n".format(whole_end2end * 1000))
f.write("Whole batch Per Token Latency: {0:8.2f} ms\n".format(whole_avg_latency * 1000))
f.write("Throughput: {} tokens/s\n".format((1000 / (whole_avg_latency * 1000))))
f.write("flops: {0:8.2f} TFlops/s\n".format(1 / whole_avg_latency * num_parameters * num_bytes / 1e12))
f.write("----------------------------------------------------------\n")
if torch.cuda.is_available():
current_device = torch.cuda.current_device()
# free memory and the total available memory in bytes
global_free_memory, total_GPU_memory_occupied = torch.cuda.mem_get_info()
memory_allocated = torch.cuda.memory_allocated()
max_memory_allocated = torch.cuda.max_memory_allocated()
memory_reserved = torch.cuda.memory_reserved()
max_memory_reserved = torch.cuda.max_memory_reserved()
with open(
log_file_name,
"a",
) as f:
f.write(
f"\nCurrently using GPU: {current_device}\n"
f"free memory : {global_free_memory / GIGABYTE:.4f} GB,\n"
f"total memory: {total_GPU_memory_occupied / GIGABYTE:.4f} GB,\n"
f"memory allocated: {memory_allocated / GIGABYTE:.4f} GB,\n"
f"Max CUDA memory allocated: {max_memory_allocated / GIGABYTE:.4f} GB,\n"
f"memory reserved/cached: {memory_reserved / GIGABYTE:.4f} GB,\n"
f"Max CUDA memory reserved/cached: {max_memory_reserved / GIGABYTE:.4f} GB,\n"
)
def benchmark_inference(args):
if args.model == "toy":
model = transformers.LlamaForCausalLM(transformers.LlamaConfig(num_hidden_layers=8))
elif args.model == "7b":
model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained("decapoda-research/llama-7b-hf"))
elif args.model == "13b":
model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained("decapoda-research/llama-13b-hf"))
else:
raise NotImplementedError
engine = CaiInferEngine(
pp_size=args.pp_size,
tp_size=args.tp_size,
dtype=args.dtype,
micro_batch_size=args.mb_size,
model=model,
verbose=True,
max_batch_size=args.mb_size,
max_input_len=args.seq_len,
max_output_len=args.output_len,
)
data = data_gen(args.batch_size, args.seq_len)
torch.cuda.synchronize()
whole_end2end = time.time()
output, timestamps = engine.generate(data)
torch.cuda.synchronize()
whole_end2end = time.time() - whole_end2end
print_details_info(timestamps, model.config, args, whole_end2end)
def hybrid_inference(rank, world_size, port, args):
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
benchmark_inference(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def benchmark(args):
spawn(hybrid_inference, nprocs=args.tp_size * args.pp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="toy", help="the size of model")
parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size")
parser.add_argument("-s", "--seq_len", type=int, default=8, help="sequence length")
parser.add_argument("--mb_size", type=int, default=1, help="micro_batch_size")
parser.add_argument("--pp_size", type=int, default=2, help="pipeline size")
parser.add_argument("--tp_size", type=int, default=2, help="pipeline size")
parser.add_argument("--output_len", type=int, default=16, help="Output length")
parser.add_argument("--log_path", type=str, default="./log", help="where to store the benchmark log")
parser.add_argument("--dtype", type=str, default="fp16", help="data type")
args = parser.parse_args()
benchmark(args)


@@ -1,81 +0,0 @@
import os
import warnings
import torch
import torch.distributed as dist
import argparse
from packaging import version
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
from transformers import AutoModelForCausalLM, AutoTokenizer
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
TPSIZE = 1
BATCH_SIZE = 4
MAX_INPUT_LEN = 32
MAX_OUTPUT_LEN = 128
CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.5')
@parameterize('test_config', [{
'tp_size': TPSIZE,
}])
def run_llama_test(test_config, args):
model_path = args.path
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.unk_token_id
model = AutoModelForCausalLM.from_pretrained(model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
text = ["Introduce London.", "What is the genus of Poodle?"]
input_ids = tokenizer.batch_encode_plus(text, return_tensors='pt', padding=True)
print(input_ids)
shard_config = ShardConfig(enable_tensor_parallelism=True if test_config['tp_size'] > 1 else False,
extra_kwargs={"inference_only": True})
infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
outputs = infer_engine.generate(input_ids, **generate_kwargs)
assert outputs is not None
if not dist.is_initialized() or dist.get_rank() == 0:
for o in outputs:
output_text = tokenizer.decode(o)
print(output_text)
def check_llama(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
run_llama_test(args=args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama(args):
spawn(check_llama, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, default = "hpcai-tech/Colossal-LLaMA-2-7b-base", help="Model path")
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=32, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
parser.add_argument(
"--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"]
)
args = parser.parse_args()
test_llama(args)


@@ -1,105 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from auto_gptq import AutoGPTQForCausalLM
from transformers import BloomTokenizerFast
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def bench_bloom(args):
pretrained_model_dir = args.path
quantized_model_dir = args.quantized_path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
tokenizer = BloomTokenizerFast.from_pretrained(pretrained_model_dir)
tokenizer.pad_token = tokenizer.eos_token
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(
quantized_model_dir, device=torch.cuda.current_device(), inject_fused_attention=False
)
model = model.half()
model_config = model.config
# init TPInferEngine and shard the original model
# To benchmark torch original, comment out the line of optimizing model
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False,
extra_kwargs={"inference_only": True, "inference_gptq": True},
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
# prepare data for generation
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
input_tokens = {
"input_ids": torch.randint(10, 1000, (max_batch_size, max_input_len)),
"attention_mask": torch.ones((max_batch_size, max_input_len)),
}
for t in input_tokens:
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
# print(f" input_tokens[{t}].shape: {input_tokens[t].shape}")
iters = 10
times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
times.append((end - start) / (out_len - max_input_len))
print_perf_stats(times, model_config, max_batch_size)
def check_bloom(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
bench_bloom(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_bloom(args):
spawn(check_bloom, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
args = parser.parse_args()
test_bloom(args)


@@ -1,87 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from auto_gptq import AutoGPTQForCausalLM
from transformers import LlamaTokenizer
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def run_llama_test(args):
pretrained_model_dir = args.path
quantized_model_dir = args.quantized_path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(
quantized_model_dir, device=torch.cuda.current_device(), inject_fused_attention=False
)
model_config = model.config
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False,
extra_kwargs={"inference_only": True, "inference_gptq": True},
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
input_tokens = {
"input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
"attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
}
iters = 10
times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
times.append((end - start) / (out_len - max_input_len))
print_perf_stats(times, model_config, max_batch_size)
def check_llama(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_llama_test(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama(args):
spawn(check_llama, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
args = parser.parse_args()
test_llama(args)


@@ -0,0 +1,72 @@
import argparse
import os
import torch
import torch.distributed as dist
from auto_gptq import AutoGPTQForCausalLM
import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.testing import spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def run_llama_inference(args):
quantized_model_dir = args.quantized_path
max_batch_size = args.max_batch_size
max_input_len = args.max_input_len
max_output_len = args.max_output_len
micro_batch_size = args.micro_batch_size
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(
quantized_model_dir, inject_fused_attention=False, device=torch.cuda.current_device()
)
engine = CaiInferEngine(
tp_size=2,
pp_size=2,
model=model,
max_batch_size=max_batch_size,
max_input_len=max_input_len,
max_output_len=max_output_len,
micro_batch_size=micro_batch_size,
quant="gptq",
)
def data_gen():
input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
return dict(input_ids=input_ids, attention_mask=attention_mask)
inputs = data_gen()
for k, v in inputs.items():
if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
new_shape = [1] * v.dim()
new_shape[0] = 16
inputs[k] = v.to("cuda").repeat(*new_shape)
output = engine.generate(inputs)
if dist.get_rank() == 0:
assert len(output[0]) == max_output_len, f"{len(output)}, {max_output_len}"
def run_gptq_inference(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_llama_inference(args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")
args = parser.parse_args()
spawn(run_gptq_inference, args.tp_size * args.pp_size, args=args)


@@ -0,0 +1,86 @@
import argparse
import time
import torch
import torch.distributed as dist
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer
import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.testing import spawn
def run_inference(args):
llama_model_path = args.path
max_input_len = args.max_input_len
max_output_len = args.max_output_len
max_batch_size = args.batch_size
micro_batch_size = args.micro_batch_size
tp_size = args.tp_size
pp_size = args.pp_size
rank = dist.get_rank()
tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
tokenizer.pad_token_id = tokenizer.unk_token_id
model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
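# NOTE: the pretrained model loaded above is replaced below by a small randomly initialized Llama config, so the benchmark runs on a toy model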
model = transformers.LlamaForCausalLM(
transformers.LlamaConfig(
vocab_size=20000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=4
)
)
engine = CaiInferEngine(
tp_size=tp_size,
pp_size=pp_size,
model=model,
max_output_len=max_output_len,
micro_batch_size=micro_batch_size,
)
input_tokens = {
"input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
"attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
}
iters = 10
warmup = 3
times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = engine.generate(input_tokens)
torch.cuda.synchronize()
end = time.time()
if rank == 0:
out_len = len(outputs[0])
print("generation time {} s".format(str(end - start)))
print(out_len)
times.append((end - start) / out_len)
if rank == 0:
times = times[warmup:]
latency = sum(times) / len(times)
print("total process latency is : " + str(latency) + " s")
print("total throughput is : " + str(1 / latency * max_batch_size))
def run_tp_pipeline_inference(rank, world_size, port, args):
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_inference(args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=2, help="Tensor parallel size")
parser.add_argument("-pp", "--pp_size", type=int, default=2, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=8, help="Maximum batch size")
parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
parser.add_argument("--max_output_len", type=int, default=16, help="Maximum output length")
parser.add_argument("--micro_batch_size", type=int, default=2, help="Micro batch size")
args = parser.parse_args()
spawn(run_tp_pipeline_inference, nprocs=args.tp_size * args.pp_size, args=args)


@@ -0,0 +1,69 @@
import argparse
import torch
import torch.distributed as dist
import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.inference.quant.smoothquant.models.llama import SmoothLlamaForCausalLM
from colossalai.logging import disable_existing_loggers
from colossalai.testing import spawn
@torch.no_grad()
def run_llama_inference(args):
quantized_model_dir = args.quantized_path
max_batch_size = args.max_batch_size
max_input_len = args.max_input_len
max_output_len = args.max_output_len
micro_batch_size = args.micro_batch_size
def data_gen():
input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
return dict(input_ids=input_ids, attention_mask=attention_mask)
inputs = data_gen()
for k, v in inputs.items():
if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
new_shape = [1] * v.dim()
new_shape[0] = 16
inputs[k] = v.to("cuda").repeat(*new_shape)
model = SmoothLlamaForCausalLM.from_quantized(quantized_model_dir, model_basename="llama-7b")
model = model.cuda()
engine = CaiInferEngine(
tp_size=2,
pp_size=2,
model=model,
max_batch_size=max_batch_size,
max_input_len=max_input_len,
max_output_len=max_output_len,
micro_batch_size=micro_batch_size,
quant="smoothquant",
)
output = engine.generate(inputs)
if dist.get_rank() == 0:
assert len(output[0]) == 32, f"{len(output)}, {32}"
def run_smoothquant_inference(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_llama_inference(args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")
args = parser.parse_args()
spawn(run_smoothquant_inference, args.tp_size * args.pp_size, args=args)


@@ -0,0 +1,55 @@
script_dir=$(cd "$(dirname "$0")" && pwd)
cd "${script_dir}"
# 7b, fp16, 2 gpu, 1024, 128
for BATCH_SIZE in 2 4 8 16; do
python ./benchmark.py \
--model="7b" \
--dtype="fp16" \
--batch_size=${BATCH_SIZE} \
--seq_len=1024 \
--output_len=128 \
--mb_size=$((${BATCH_SIZE}/2)) \
--pp_size=2 \
--tp_size=2
done
# 7b, fp16, 2 gpu, 512, 512
for BATCH_SIZE in 2 4 8 16 32; do
python ./benchmark.py \
--model="7b" \
--dtype="fp16" \
--batch_size=${BATCH_SIZE} \
--seq_len=512 \
--output_len=512 \
--mb_size=$((${BATCH_SIZE}/2)) \
--pp_size=2 \
--tp_size=2
done
# 13b, fp16, 2 gpu, 1024, 128
for BATCH_SIZE in 2 4 8; do
python ./benchmark.py \
--model="13b" \
--dtype="fp16" \
--batch_size=${BATCH_SIZE} \
--seq_len=1024 \
--output_len=128 \
--mb_size=$((${BATCH_SIZE}/2)) \
--pp_size=2 \
--tp_size=2
done
# 13b, fp16, 2 gpu, 512, 512
for BATCH_SIZE in 2 4 8 16; do
python ./benchmark.py \
--model="13b" \
--dtype="fp16" \
--batch_size=${BATCH_SIZE} \
--seq_len=512 \
--output_len=512 \
--mb_size=$((${BATCH_SIZE}/2)) \
--pp_size=2 \
--tp_size=2
done


@@ -1,153 +0,0 @@
import logging
import os
from typing import Any, List, Union
import ray
import ray.util.collective as collective
import starlette
import torch
from pydantic import BaseModel
from ray import serve
from ray.serve import Application
from transformers import AutoModelForCausalLM, AutoTokenizer
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.shardformer import ShardConfig
from colossalai.testing import free_port
ray_serve_logger = logging.getLogger("ray.serve")
class GenConfigArgs(BaseModel):
"""Config for generation"""
path: str
tp_size: int = 2
max_batch_size: int = 4
max_input_len: int = 128
max_output_len: int = 32
def log_cuda_info(scope_name: str):
ray_serve_logger.info(f" {scope_name}: ray.get_gpu_ids(): {ray.get_gpu_ids()}")
ray_serve_logger.info(
f" {scope_name}: CUDA_VISIBLE_DEVICES: {os.getenv('CUDA_VISIBLE_DEVICES', 'NO DEVICES FOUND!')}"
)
if torch.cuda.is_available():
ray_serve_logger.info(
f" {scope_name}: cuda current_device: {torch.cuda.current_device()}, cuda device count: {torch.cuda.device_count()}"
)
else:
ray_serve_logger.info(f" {scope_name}: cuda is not available!")
@ray.remote(num_gpus=1)
class Worker:
def __init__(self, model_path: str, tp_size: int, max_batch_size: int, max_input_len: int, max_output_len: int):
log_cuda_info("Worker.init")
self.tp_size = tp_size
self.model_path = model_path
self.max_batch_size = max_batch_size
self.max_input_len = max_input_len
self.max_output_len = max_output_len
def setup(self, world_size, rank, port):
# initialize a ray collective group, otherwise colossalai distributed env won't be built successfully
collective.init_collective_group(world_size, rank, "nccl", "default")
# initialize and set distributed environment
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..")
log_cuda_info("Worker.setup")
# Load model
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path, pad_token_id=self.tokenizer.pad_token_id, torch_dtype=torch.float16
)
shard_config = ShardConfig(
enable_tensor_parallelism=True if world_size > 1 else False, extra_kwargs={"inference_only": True}
)
self.infer_engine = TPInferEngine(
self.model, shard_config, self.max_batch_size, self.max_input_len, self.max_output_len
)
self.generate_kwargs = dict(max_new_tokens=self.max_output_len, do_sample=False)
return True
def generate(self, text: Union[str, List[str]]) -> str:
input_tokens = self.tokenizer.batch_encode_plus(text, return_tensors="pt", padding=True)
ray_serve_logger.info(f"text: {text},\ninput_tokens: {input_tokens}")
model_output = self.infer_engine.generate(input_tokens, **self.generate_kwargs)
ray_serve_logger.info(f"model_output.shape: {model_output.shape}")
text_output = []
for i in range(len(model_output)):
text_output.append(self.tokenizer.decode(model_output[i]))
ray_serve_logger.info(f"output: {text_output}")
return text_output
@serve.deployment(
ray_actor_options={"num_cpus": 1, "num_gpus": 0},
max_concurrent_queries=5,
autoscaling_config={
"target_num_ongoing_requests_per_replica": 1,
"min_replicas": 1,
"initial_replicas": 1,
"max_replicas": 1,
},
)
class Driver:
def __init__(self, config: GenConfigArgs):
log_cuda_info("Driver:init")
model_path = config.path
tp_size = config.tp_size
self.num_workers = tp_size
self.workers = []
init_rets = []
# Just grab a free port on localhost
# NOTE workers in this communication group listen to the same port
available_port = free_port()
for i in range(self.num_workers):
worker_name = "worker_idx_{}".format(i)
w = Worker.options(name=worker_name).remote(
model_path, self.num_workers, config.max_batch_size, config.max_input_len, config.max_output_len
)
self.workers.append(w)
init_rets.append(w.setup.remote(self.num_workers, i, available_port))
_options = {
"group_name": "default_driver",
"world_size": self.num_workers,
"ranks": [i for i in range(self.num_workers)],
"backend": "nccl",
}
collective.create_collective_group(self.workers, **_options)
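# block until every worker has finished setup (model loaded and inference engine initialized)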
_ = ray.get(init_rets)
# set batch wait delay in seconds and maximum number of sequences in a batch
@serve.batch(batch_wait_timeout_s=0.8, max_batch_size=4)
async def batch_generate(self, requests: List[str]):
ray_serve_logger.info(f"Driver.batch_generate: requests length: {len(requests)}\n requests: {requests}")
results = ray.get([w.generate.remote(requests) for w in self.workers])
text_res = results[0] # get any one of the copies
return text_res
async def __call__(self, request: starlette.requests.Request) -> Any:
return await self.batch_generate(request.query_params["text"])
def app(args: GenConfigArgs) -> Application:
print(args)
if args.path is None or not os.path.exists(args.path):
raise ValueError("Model path not provided or invalid path!")
return Driver.options(name="Colossal-Inference-Driver").bind(config=args)


@@ -1,86 +0,0 @@
# Colossal-Inference with Ray Serve
This example demonstrates and tests the deployment of Colossal-AI inference (`colossalai.inference`) with [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). It imports inference modules from colossalai and is based on https://github.com/hpcaitech/ColossalAI/tree/a22706337a57dd1c98b95739dd09d98bd55947a0.
Both single-GPU and multi-GPU (i.e. tensor parallel) inference serving are supported.
## Installation
### Conda Environment
```bash
# create a new conda env with python 3.8
conda create -n ray_test python=3.8.18
# use torch1.13+cuda11.6
pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
# install ray from wheels
pip install -U "ray[default,serve]"
# install cuda toolkit (e.g. nvcc, etc)
conda install -c "nvidia/label/cuda-11.6.2" cuda-toolkit
# install cuDNN, cuTENSOR, and NCCL
conda install -c conda-forge cupy cudnn cutensor nccl cuda-version=11.6
# install colossalai with PyTorch extensions
cd <path_to_ColossalAI_repo>
CUDA_EXT=1 pip install -e .
# install other dependencies
pip install triton==2.0.0.dev20221202
pip install transformers
```
## Launch Ray Serve and run the app
### Method #1. CLI command
From the current directory, launch the app with the following command:
```bash
RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app path="PATH_TO_YOUR_MODEL_DIR"
```
By default, Ray deduplicates logs across the cluster. Here we set `RAY_DEDUP_LOGS=0` to disable log deduplication so that each actor logs its information in the CLI. `serve run` runs an application from the specified import path; the format is `<filename>:<app_name>`.
Then send requests by running the Python script in another window:
```bash
python send_request.py
```
### Method #2. Run inside script
We can also launch Ray Serve and run the app inside a single script with a few modifications:
To keep the Ray handler from raising errors when serializing pydantic objects, replace the config class `class GenConfigArgs(BaseModel)` with
```python
from dataclasses import dataclass
@dataclass
class GenConfigArgs:
# attributes remain unchanged
```
Comment out the app builder
```python
# def app(args: GenConfigArgs) -> Application:
# ...
# return Driver.options(name="Colossal-Inference-Driver").bind(config=args)
```
And append the following lines to the end of the file:
```python
from ray.serve.handle import DeploymentHandle, DeploymentResponse
app = Driver.bind(config=GenConfigArgs(path="<Path_to_model_dir>"))
handle: DeploymentHandle = serve.run(app).options(use_new_handle_api=True)
response: DeploymentResponse = handle.batch_generate.remote(requests="Introduce some landmarks in Beijing")
print(response.result())
```
Then run the script:
```bash
python Colossal_Inference_rayserve.py
```
### Terminate Ray Serve
Ray Serve and the application terminate automatically when you use the second method (running the job inside a script). If you use the first method (`serve run`), press `Ctrl+C` to shut down the application, or run `serve shutdown` to shut down Serve and delete all applications on the Ray cluster.
To make sure all the active Ray processes are killed, run
```bash
ray stop
```


@@ -1,15 +0,0 @@
import ray
import requests
@ray.remote
def send_query(text):
resp = requests.get("http://localhost:8000/?text={}".format(text))
return resp.text
test_sentence = "Introduce some landmarks in Beijing"
result = ray.get(send_query.remote(test_sentence))
print("Result returned:")
print(result)


@@ -1,27 +0,0 @@
import ray
import requests
@ray.remote
def send_query(text):
resp = requests.get("http://localhost:8000/?text={}".format(text))
return resp.text
test_sentences = [
"Introduce some landmarks in Beijing",
"What is the weather today",
"Coding requires practice and patience",
"Rainy days inspire cozy reading",
"Laughter is contagious and heartwarming",
"Hiking mountains builds strength and resilience",
"Family bonds grow stronger with time",
"Science unlocks mysteries of the universe",
"Music soothes the soul and ignites passion",
"Artistic expression knows no boundaries",
]
results = ray.get([send_query.remote(text) for text in test_sentences])
print("Result returned:")
for res in results:
print(res)


@@ -1,195 +0,0 @@
import logging
import os
import zipfile
from abc import ABC
import torch
import transformers
from transformers import AutoTokenizer, BloomForCausalLM, BloomTokenizerFast, LlamaForCausalLM
from ts.torch_handler.base_handler import BaseHandler
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.shardformer import ShardConfig
from colossalai.testing import free_port
logger = logging.getLogger(__name__)
logger.info("Transformers version %s", transformers.__version__)
logger.info("ColossalAI version %s", colossalai.__version__)
class ColossalInferenceHandler(BaseHandler, ABC):
"""
Transformers handler class for testing
"""
def __init__(self):
super(ColossalInferenceHandler, self).__init__()
self.infer_engine = None
self.max_batch_size = None
self.max_input_len = None
self.max_output_len = None
self.tokenizer = None
self.initialized = False
def initialize(self, ctx):
"""Expected behaviour: the sharded Bloom/Llama model is loaded.
Args:
ctx (context): It is a JSON Object containing information
pertaining to the model artefacts parameters.
"""
if ctx is None or not hasattr(ctx, "model_yaml_config"):
logger.error("Context ctx and model-config are not appropriately passed in.")
self.manifest = ctx.manifest
gpu_id = ctx.system_properties.get("gpu_id", -1)
model_dir = ctx.system_properties.get("model_dir")
# Inference configs are collected together in model yaml config for handler use
inference_config = ctx.model_yaml_config["handler"]
self.inference_config = inference_config
logger.info(self.inference_config)
self.tp_size = self.inference_config.get("tp_size", 1)
self.max_batch_size = self.inference_config.get("max_batch_size", 4)
self.max_input_len = self.inference_config.get("max_input_len", 1024)
self.max_output_len = self.inference_config.get("max_output_len", 128)
self.device = torch.device("cuda:" + str(gpu_id) if torch.cuda.is_available() and gpu_id >= 0 else "cpu")
logger.info(f"Device set to {self.device}")
logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}")
# Unpacking from model_dir
model_dir_path = os.path.join(model_dir, "model")
with zipfile.ZipFile(model_dir + "/model.zip", "r") as zip_ref:
zip_ref.extractall(model_dir_path)
logger.info(f"Loading {self.inference_config['model_type']} pretrain model and tokenizer")
if self.inference_config["model_type"] == "bloom":
self.model = BloomForCausalLM.from_pretrained(
model_dir_path,
)
self.tokenizer = BloomTokenizerFast.from_pretrained(model_dir_path, return_tensors="pt")
elif self.inference_config["model_type"] == "llama":
self.model = LlamaForCausalLM.from_pretrained(
model_dir_path,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_dir_path, return_tensors="pt")
else:
logger.warning(f"Model type {self.inference_config['model_type']} not supported yet.")
logger.info("Transformer model from path %s loaded successfully", model_dir)
# NOTE world_size, rank, host, port here are used to launch colossalai dist environment
# This world_size is different from the world size of TorchServe
world_size = int(os.getenv("WORLD_SIZE", self.tp_size))
assert world_size == 1, "Colossal-Inference with tensor parallel is not supported on TorchServe for now"
rank = int(os.getenv("RANK", gpu_id))
local_rank = int(os.getenv("LOCAL_RANK", gpu_id))
host = os.getenv("MASTER_ADDR", "localhost")
port = os.getenv("MASTER_PORT", free_port()) # use a random free port
logger.info(
f" world_size {world_size}" f" local_rank {local_rank}" f" rank {rank}" f" host {host}" f" port {port}"
)
torch.cuda.set_device(self.device)
self.model.half()
self.model.cuda()
self.model.eval()
colossalai.launch(config={}, rank=rank, world_size=world_size, host=host, port=port, backend="nccl")
logger.info("Initializing TPInferEngine ...")
shard_config = ShardConfig(
enable_tensor_parallelism=True if self.tp_size > 1 else False, extra_kwargs={"inference_only": True}
)
self.infer_engine = TPInferEngine(
self.model, shard_config, self.max_batch_size, self.max_input_len, self.max_output_len
)
logger.info("TPInferEngine initialized successfully")
self.model = self.infer_engine.model
self.initialized = True
def preprocess(self, requests):
"""Basic text preprocessing, based on the user's chocie of application mode.
Args:
requests: The Input data in the form of text is passed on to the preprocess
function.
Returns:
list : The preprocess function returns a list of Tensor for the size of the word tokens.
"""
logger.info("Pre-processing requests")
input_ids_batch = None
attention_mask_batch = None
for idx, data in enumerate(requests):
input_text = data.get("data")
if input_text is None:
input_text = data.get("body")
if isinstance(input_text, (bytes, bytearray)):
input_text = input_text.decode("utf-8")
logger.info("Received text: '%s'", input_text)
inputs = self.tokenizer.encode_plus(
input_text,
max_length=self.max_input_len,
padding=True,
add_special_tokens=True,
return_tensors="pt",
truncation=True,
)
input_ids = inputs["input_ids"].to(self.device)
attention_mask = inputs["attention_mask"].to(self.device)
# making a batch out of the received requests
# attention masks are passed for cases where input tokens are padded.
if input_ids.shape is not None:
if input_ids_batch is None:
input_ids_batch = input_ids
attention_mask_batch = attention_mask
else:
input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
attention_mask_batch = torch.cat((attention_mask_batch, attention_mask), 0)
return (input_ids_batch, attention_mask_batch)
def inference(self, input_batch):
"""Predict the class (or classes) of the received text using the
serialized transformers checkpoint.
Args:
input_batch (list): List of Text Tensors from the pre-process function is passed here
Returns:
list : It returns a list of the predicted value for the input text
"""
input_ids_batch, attention_mask_batch = input_batch
inferences = []
do_sample = self.inference_config.get("do_sample", True)
top_p = self.inference_config.get("top_p", 0.95 if do_sample else 1.0)
top_k = self.inference_config.get("top_k", 60 if do_sample else 50)
input_ids_batch = input_ids_batch.to(self.device)
outputs = self.infer_engine.generate(
dict(input_ids=input_ids_batch, attention_mask=attention_mask_batch),
do_sample=do_sample,
top_p=top_p,
top_k=top_k,
)
for i, _ in enumerate(outputs):
inferences.append(self.tokenizer.decode(outputs[i], skip_special_tokens=True))
# For testing only
logger.info(
f"Generated text: {inferences}",
)
return inferences
def postprocess(self, inference_output):
"""Post Process Function converts the predicted response into Torchserve readable format.
Args:
inference_output (list): It contains the predicted response of the input text.
Returns:
(list): Returns a list of the Predictions and Explanations.
"""
return inference_output


@@ -1,109 +0,0 @@
# Colossal-Inference with TorchServe
## Overview
This demo tests and demonstrates the deployment of Colossal-AI inference (`colossalai.inference`) with TorchServe. It imports inference modules from colossalai and is based on
https://github.com/hpcaitech/ColossalAI/tree/3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0. For now, only single-GPU inference serving is supported.
## Environment for testing
### Option #1: Use Conda Env
The steps below create a conda env for local testing. We may switch to Docker or a cloud environment later.
*NOTE*: TorchServe requires a JDK installation and `JAVA_HOME` to be set. We recommend installing OpenJDK 17 (please refer to https://openjdk.org/projects/jdk/17/).
```bash
# use python 3.8 or 3.9
conda create -n infer python=3.9
# use torch 1.13+cuda11.6 for inference
pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
# conda cuda toolkit (e.g. nvcc, etc)
conda install -c "nvidia/label/cuda-11.6.2" cuda-toolkit
# install colossalai with PyTorch extensions
cd <path_to_ColossalAI_repo>
pip install -r requirements/requirements.txt
pip install -r requirements/requirements-test.txt
CUDA_EXT=1 pip install -e .
# install torchserve
cd <path_to_torch_serve_repo>
python ./ts_scripts/install_dependencies.py --cuda=cu116
pip install torchserve torch-model-archiver torch-workflow-archiver
```
### Option #2: Use Docker
To use the Docker image, you can build it using the provided [Dockerfile](./docker/Dockerfile).
```bash
# build from dockerfile
cd ColossalAI/examples/inference/serving/torch_serve/docker
docker build -t hpcaitech/colossal-infer-ts:0.2.0 .
```
Once you have the image ready, you can launch a container with the following command:
```bash
cd ColossalAI/examples/inference/serving/torch_serve
# run the docker container
docker run --rm \
-it --gpus all \
--name <name_you_assign> \
-v <your-data-dir>:/data/scratch \
-w <ColossalAI_dir> \
hpcaitech/colossal-infer-ts:0.2.0 \
/bin/bash
```
## Steps to deploy a model
### 1. Download/prepare a model
We will download a BLOOM model and then zip it. You can download the model from [HuggingFace](https://huggingface.co/models) manually, or refer to the script [download_model.py](https://github.com/pytorch/serve/blob/c3ca2599b4d36d2b61302064b02eab1b65e1908d/examples/large_models/utils/Download_model.py) provided by the pytorch-serve team to download a snapshot of the model.
```bash
# download snapshots
cd <path_to_torch_serve>/examples/large_models/utils/
huggingface-cli login
python download_model.py --model_name bigscience/bloom-560m -o <path_to_store_downloaded_model>
# zip the model repo
cd <path_to_store_downloaded_model>/models--bigscience--bloom-560m/snapshots/<specific_revision>
zip -r <path_to_place_zipped_model>//model.zip *
```
> **_NOTE:_** The torch archiver and the server use the `/tmp/` folder. Depending on your disk quota, running torch-model-archiver might raise OSError "Disk quota exceeded". To prevent this, set the tmp dir environment variables as follows:
`export TMPDIR=<dir_with_enough_space>/tmp` and `export TEMP=<dir_with_enough_space>/tmp`,
or use relatively small models (as we did) for local testing.
### 2. Archive the model
With torch-model-archiver, we pack the model file (.zip) and the handler file (.py) into a single .mar file, which TorchServe unpacks during serving. Relevant model and inference configs can be set in `model-config.yaml`.
```bash
cd ./ColossalAI/examples/inference/serving/torch_serve
# create a folder under the current directory to store the packed model created by torch archiver
mkdir model_store
torch-model-archiver --model-name bloom --version 0.1 --handler Colossal_Inference_Handler.py --config-file model-config.yaml --extra-files <dir_zipped_model>/model.zip --export-path ./model_store/
```
### 3. Launch serving
Modify `load_models` in config.properties to select the model(s) stored in the `<model_store>` directory to be deployed. By default we use `load_models=all` to load and deploy all the models (.mar) we have.
```bash
torchserve --start --ncs --ts-config config.properties
```
We could set inference, management, and metrics addresses and other TorchServe settings in `config.properties`.
TorchServe will create a folder `logs/` under the current directory to store ts, model, and metrics logs.
### 4. Run inference
```bash
# check inference status
curl http://0.0.0.0:8084/ping
curl -X POST http://localhost:8084/predictions/bloom -T sample_text.txt
```
To stop TorchServe, run `torchserve --stop`


@@ -1,10 +0,0 @@
inference_address=http://0.0.0.0:8084
management_address=http://0.0.0.0:8085
metrics_address=http://0.0.0.0:8086
enable_envvars_config=true
install_py_dep_per_model=true
number_of_gpu=1
load_models=all
max_response_size=655350000
default_response_timeout=6000
model_store=./model_store


@@ -1,57 +0,0 @@
FROM hpcaitech/pytorch-cuda:1.13.0-11.6.0
# enable passwordless ssh
RUN mkdir ~/.ssh && \
printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \
ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
# install curl
RUN apt-get update && \
apt-get -y install curl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Download and extract OpenJDK 17
ENV JAVA_HOME /opt/openjdk-17
RUN apt-get update && \
apt-get install -y wget && \
wget -q https://download.java.net/openjdk/jdk17/ri/openjdk-17+35_linux-x64_bin.tar.gz -O /tmp/openjdk.tar.gz && \
mkdir -p $JAVA_HOME && \
tar xzf /tmp/openjdk.tar.gz -C $JAVA_HOME --strip-components=1 && \
rm /tmp/openjdk.tar.gz && \
apt-get purge -y --auto-remove wget && \
rm -rf /var/lib/apt/lists/*
ENV PATH $JAVA_HOME/bin:$PATH
RUN export JAVA_HOME
RUN java -version
# install ninja
RUN apt-get update && \
apt-get install -y --no-install-recommends ninja-build && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# install colossalai
ARG VERSION=main
RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git && \
cd ./ColossalAI && \
git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 && \
CUDA_EXT=1 pip install -v --no-cache-dir .
# install titans
RUN pip install --no-cache-dir titans
# install transformers
RUN pip install --no-cache-dir transformers
# install triton
RUN pip install --no-cache-dir triton==2.0.0.dev20221202
# install torchserve
ARG VERSION=master
RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git && \
cd ./serve && \
python ./ts_scripts/install_dependencies.py --cuda=cu116 && \
pip install torchserve torch-model-archiver torch-workflow-archiver


@@ -1,16 +0,0 @@
# TS frontend parameters settings
minWorkers: 1 # minimum number of workers of a model
maxWorkers: 1 # maximum number of workers of a model
batchSize: 8 # batch size of a model
maxBatchDelay: 100 # maximum delay of a batch (ms)
responseTimeout: 120 # timeout of a specific model's response (*in sec)
deviceType: "gpu"
# deviceIds: [0, 1] # setting CUDA_VISIBLE_DEVICES
handler:
    mode: "text_generation"
    model_type: "bloom"
    tp_size: 1
    max_batch_size: 8
    max_input_len: 1024
    max_output_len: 128


@@ -1 +0,0 @@
Introduce some landmarks in Beijing