mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-05 02:51:59 +00:00
[inference] Refactor inference architecture (#5057)
* [inference] support only TP (#4998) * support only tp * enable tp * add support for bloom (#5008) * [refactor] refactor gptq and smoothquant llama (#5012) * refactor gptq and smoothquant llama * fix import error * fix linear import torch-int * fix smoothquant llama import error * fix import accelerate error * fix bug * fix import smooth cuda * fix smoothcuda * [Inference Refactor] Merge chatglm2 with pp and tp (#5023) merge chatglm with pp and tp * [Refactor] remove useless inference code (#5022) * remove useless code * fix quant model * fix test import bug * mv original inference legacy * fix chatglm2 * [Refactor] refactor policy search and quant type controlling in inference (#5035) * [Refactor] refactor policy search and quant type controling in inference * [inference] update readme (#5051) * update readme * update readme * fix architecture * fix table * fix table * [inference] udpate example (#5053) * udpate example * fix run.sh * fix rebase bug * fix some errors * update readme * add some features * update interface * update readme * update benchmark * add requirements-infer --------- Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com> Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
This commit is contained in:
@@ -1,72 +0,0 @@
|
||||
import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
from transformers import BloomForCausalLM
|
||||
from transformers.models.bloom.configuration_bloom import BloomConfig
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference.tensor_parallel import TPInferEngine
|
||||
from colossalai.logging import disable_existing_loggers
|
||||
from colossalai.shardformer import ShardConfig
|
||||
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
|
||||
|
||||
# Detect the optional lightllm kernels without importing them.
# The previous ``try`` body contained no import statement, so the ``except``
# branch was unreachable and the flag was unconditionally True.
import importlib.util

HAS_LIGHTLLM_KERNEL = importlib.util.find_spec("lightllm") is not None
|
||||
|
||||
# Sizes passed to ShardConfig / TPInferEngine and used to build the dummy batch.
TP_SIZE = 2
MAX_BATCH_SIZE = 4
MAX_INPUT_LEN = 16
MAX_OUTPUT_LEN = 32

# ``torch.version.cuda`` is None on builds without CUDA; guard it so that the
# module can still be imported (and the test skipped) instead of raising.
CUDA_SUPPORT = torch.version.cuda is not None and version.parse(torch.version.cuda) > version.parse("11.5")
|
||||
|
||||
|
||||
@parameterize("test_config", [{"tp_size": TP_SIZE}])
def run(test_config):
    """Generate from a small half-precision BLOOM model through ``TPInferEngine``.

    Args:
        test_config: dict with a ``"tp_size"`` entry; tensor parallelism is
            enabled when that value is greater than 1.
    """
    config = BloomConfig(num_hidden_layers=2, bos_token_id=0, eos_token_id=1, vocab_size=1200, hidden_size=1024)
    model = BloomForCausalLM(config).half()

    use_tp = test_config["tp_size"] > 1
    shard_config = ShardConfig(enable_tensor_parallelism=use_tp, extra_kwargs={"inference_only": True})
    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

    # A full batch of random token ids with an all-ones attention mask.
    batch_shape = (MAX_BATCH_SIZE, MAX_INPUT_LEN)
    input_tokens = {
        "input_ids": torch.randint(1, 1000, batch_shape, device="cuda"),
        "attention_mask": torch.ones(batch_shape, device="cuda"),
    }
    outputs = infer_engine.generate(input_tokens, max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)

    assert outputs is not None
|
||||
|
||||
|
||||
def check_bloom(rank, world_size, port):
    """Worker entry point: silence loggers, launch colossalai over NCCL on localhost, then call ``run``."""
    disable_existing_loggers()
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    # The condition covers two requirements; name both so a skip is not misattributed.
    reason="kv-cache manager engine requires cuda version to be higher than 11.5 and the lightllm package to be installed",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_bloom_infer():
    """Spawn ``TP_SIZE`` worker processes, each running ``check_bloom``."""
    spawn(check_bloom, TP_SIZE)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_bloom_infer()
|
@@ -1,83 +0,0 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference.tensor_parallel.engine import TPInferEngine
|
||||
from colossalai.logging import disable_existing_loggers
|
||||
from colossalai.shardformer import ShardConfig
|
||||
from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
|
||||
from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration
|
||||
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
|
||||
|
||||
# Probe for the optional lightllm package. ``except Exception`` keeps the
# best-effort behaviour (any import-time failure disables the test) without
# also swallowing KeyboardInterrupt / SystemExit like a bare ``except`` does.
try:
    import lightllm  # noqa
except Exception:
    HAS_LIGHTLLM_KERNEL = False
else:
    HAS_LIGHTLLM_KERNEL = True
|
||||
|
||||
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

# Sizes passed to ShardConfig / TPInferEngine and used to build the dummy batch.
TPSIZE = 2
BATCH_SIZE = 8
MAX_INPUT_LEN = 12
MAX_OUTPUT_LEN = 100

# ``torch.version.cuda`` is None on builds without CUDA; guard it so that the
# module can still be imported (and the test skipped) instead of raising.
CUDA_SUPPORT = torch.version.cuda is not None and version.parse(torch.version.cuda) > version.parse("11.5")
|
||||
|
||||
|
||||
@parameterize("test_config", [{"tp_size": TPSIZE}])
def run_chatglm2_test(test_config):
    """Generate from a small half-precision ChatGLM2 model through ``TPInferEngine``.

    Args:
        test_config: dict with a ``"tp_size"`` entry; tensor parallelism is
            enabled when that value is greater than 1.
    """
    config_kwargs = dict(
        num_layers=2,
        vocab_size=1200,
        use_cache=True,
        multi_query_attention=True,
        multi_query_group_num=2,
        num_attention_heads=8,
        hidden_size=1024,
    )
    model = ChatGLMForConditionalGeneration(ChatGLMConfig(**config_kwargs)).half()

    use_tp = test_config["tp_size"] > 1
    shard_config = ShardConfig(enable_tensor_parallelism=use_tp, extra_kwargs={"inference_only": True})
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

    # A full batch of random token ids with an all-ones attention mask.
    batch_shape = (BATCH_SIZE, MAX_INPUT_LEN)
    input_tokens = {
        "input_ids": torch.randint(1, 1000, batch_shape, device="cuda"),
        "attention_mask": torch.ones(batch_shape, device="cuda"),
    }
    outputs = infer_engine.generate(input_tokens, max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
    assert outputs is not None
|
||||
|
||||
|
||||
def check_chatglm2(rank, world_size, port):
    """Worker entry point: silence loggers, launch colossalai over NCCL on localhost, then call ``run_chatglm2_test``."""
    disable_existing_loggers()
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run_chatglm2_test()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    # The condition covers two requirements; name both so a skip is not misattributed.
    reason="kv-cache manager engine requires cuda version to be higher than 11.5 and the lightllm package to be installed",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_chatglm2():
    """Spawn ``TPSIZE`` worker processes, each running ``check_chatglm2``."""
    spawn(check_chatglm2, TPSIZE)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_chatglm2()
|
@@ -1,14 +0,0 @@
|
||||
engine_config:
|
||||
model: MODEL_PATH
|
||||
tensor_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_input_len: 1024
|
||||
max_output_len: 512
|
||||
# config for app router deployment
|
||||
# Resources assigned to each model replica. This should correspond to Ray AIR ScalingConfig.
|
||||
router_config:
|
||||
max_total_token_num: 4096
|
||||
batch_max_tokens: 4096
|
||||
disable_log_stats: False
|
||||
log_stats_interval: 10
|
||||
model: MODEL_PATH
|
@@ -1,61 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference.async_engine import Async_Engine
|
||||
from colossalai.inference.dynamic_batching.ray_init_config import RayInitConfig
|
||||
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
|
||||
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
|
||||
|
||||
PATH = "config.yaml"
|
||||
|
||||
|
||||
def run_async_engine(path: str):
    """Drive ``asy_for_loop_test`` from a YAML config, silently returning when prerequisites are missing.

    Returns early (``None``) when ``path`` does not exist, or when the model
    entry of the engine config is ``None`` or not an existing path.
    """
    config_exists = os.path.exists(path)
    if not config_exists:
        return

    config = RayInitConfig.from_yaml_path(path)
    model = config.engine_config_data.model
    model_available = model is not None and os.path.exists(model)
    if not model_available:
        return

    sampling_params = SamplingParams()
    prompt = "Introduce some landmarks in London.\n The Tower of London is a historic castle on the north bank of the River Thames in central London. It was founded towards the end of 10"
    asyncio.run(asy_for_loop_test(config, prompt, sampling_params))
|
||||
|
||||
|
||||
async def get_result(engine, prompt, sampling_params):
    """Submit ``prompt`` under a fresh UUID-hex request id and assert every yielded result is not ``None``."""
    request_id = uuid.uuid4().hex
    async for item in engine.generate(request_id, prompt, sampling_params):
        assert item is not None
|
||||
|
||||
|
||||
async def asy_for_loop_test(config, prompt, sampling_params):
    """Build an ``Async_Engine`` from ``config`` and await ``get_result`` ten times in sequence."""
    engine = Async_Engine(
        router_config=config.router_config_data,
        engine_config=config.engine_config_data,
    )
    for i in range(10):
        print("in for loop", i)
        await get_result(engine, prompt, sampling_params)
|
||||
|
||||
|
||||
def check_async_engine(rank, world_size, port):
    """Worker entry point: launch colossalai over NCCL on localhost, then run the async engine check on ``PATH``."""
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run_async_engine(PATH)
|
||||
|
||||
|
||||
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_async_engine():
    """Run ``check_async_engine`` in a single spawned process."""
    spawn(check_async_engine, nprocs=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_async_engine()
|
@@ -1,95 +0,0 @@
|
||||
import pytest
|
||||
from transformers import LlamaForCausalLM
|
||||
from transformers.models.llama.configuration_llama import LlamaConfig
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference.dynamic_batching.io_struct import Req
|
||||
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
|
||||
from colossalai.inference.manager import DynamicBatchManager
|
||||
from colossalai.inference.tensor_parallel import TPInferEngine
|
||||
from colossalai.shardformer import ShardConfig
|
||||
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
|
||||
|
||||
TP_SIZE = 1
|
||||
BATCH_SIZE = 2
|
||||
MAX_INPUT_LEN = 48
|
||||
MAX_OUTPUT_LEN = 256
|
||||
|
||||
|
||||
def run():
    """Exercise add/abort/batch-init/filter/merge on a ``DynamicBatchManager`` backed by a tiny Llama model."""
    sampling_params = SamplingParams()

    # Requests 0-2 are initialized as single-token forward requests and pre-loaded
    # into the waiting list; request 3 is added later through ``add_req``.
    waiting_list = [Req(idx, [token], sampling_params) for idx, token in enumerate([1, 2, 3])]
    req4 = Req(3, [10, 10, 10, 9, 1], sampling_params)

    # Model and tensor-parallel engine (tensor parallelism disabled here).
    llama_config = LlamaConfig(num_hidden_layers=2, bos_token_id=0, eos_token_id=1, vocab_size=1200, hidden_size=1024)
    model = LlamaForCausalLM(llama_config).half()
    shard_config = ShardConfig(enable_tensor_parallelism=False, extra_kwargs={"inference_only": True})
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

    manager = DynamicBatchManager(
        tp_engine=infer_engine,
        max_total_token_num=640,
        batch_max_tokens=608,
        eos_id=0,
        log_stats=False,
        log_stats_interval=10,
        waiting_req_list=waiting_list,
        model="llama",
    )
    queue = manager.req_queue
    before_add = len(queue)

    # add_req: the waiting list grows by one.
    manager.add_req(req4.request_id, req4.prompt_ids, req4.sample_params)
    assert len(manager.req_queue.waiting_req_list) == before_add + 1

    # abort: the request just added is flagged as aborted.
    manager.abort(req4.request_id)
    assert manager.req_queue.waiting_req_list[-1].aborted == True

    # Batch generation and initialisation
    # (covers filter batch, loop_for_fwd, _step, _init_batch and _prefill/_decode batch).
    batch = manager.req_queue.generate_new_batch()
    assert len(batch) == 2

    manager._init_batch(batch)
    assert manager.engine.cache[batch.batch_id] is not None

    # Mark one request finished and filter it out of the batch.
    batch.reqs[0].has_generate_finished = True
    batch.filter_finished()
    manager._filter_batch(batch)
    assert len(manager.engine.cache) == 1

    # Merge a freshly generated batch into the existing one.
    new_batch = manager.req_queue.generate_new_batch(batch)
    assert len(new_batch) == 1
    manager._init_batch(new_batch)
    manager._merge_batch(batch, new_batch)

    assert len(manager.engine.cache[batch.batch_id]) == 2
|
||||
|
||||
|
||||
def check_dynamic_batching_manager(rank, world_size, port):
    """Worker entry point: launch colossalai over NCCL on localhost, then call ``run``."""
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run()
|
||||
|
||||
|
||||
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_dynamic_batching_manager():
    """Run ``check_dynamic_batching_manager`` in a single spawned process."""
    spawn(check_dynamic_batching_manager, nprocs=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_dynamic_batching_manager()
|
@@ -1,86 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
from transformers import LlamaForCausalLM
|
||||
from transformers.models.llama.configuration_llama import LlamaConfig
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference.dynamic_batching.io_struct import Req
|
||||
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
|
||||
from colossalai.inference.manager import start_dynamic_batching
|
||||
from colossalai.inference.tensor_parallel import TPInferEngine
|
||||
from colossalai.shardformer import ShardConfig
|
||||
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
|
||||
|
||||
# Sizes passed to ShardConfig / TPInferEngine.
TP_SIZE = 1
MAX_BATCH_SIZE = 2
MAX_INPUT_LEN = 5
MAX_OUTPUT_LEN = 16

# ``torch.version.cuda`` is None on builds without CUDA; guard it so that the
# module can still be imported (and the test skipped) instead of raising.
CUDA_SUPPORT = torch.version.cuda is not None and version.parse(torch.version.cuda) > version.parse("11.5")
|
||||
|
||||
|
||||
@dataclass()
class args:
    """Plain settings record that ``run`` hands to ``start_dynamic_batching``."""

    # Field order defines the positional constructor order; keep it stable.
    max_total_token_num: int
    batch_max_tokens: int
    model: str
    eos_id: int
    disable_log_stats: bool
    log_stats_interval: int
|
||||
|
||||
|
||||
def run():
    """Start dynamic batching over a tiny Llama model and check that generation yields non-``None`` results."""
    arg = args(
        max_total_token_num=42,
        model="llama",
        batch_max_tokens=42,
        eos_id=0,
        disable_log_stats=False,
        log_stats_interval=10,
    )
    sampling_params = SamplingParams()

    # Four pre-queued requests with ids 0-3, one per prompt below.
    prompt_ids = [
        [0, 0, 10, 6, 8],
        [10, 10, 10, 10, 10],
        [0, 0, 10, 10, 10],
        [0, 0, 10, 10, 10],
    ]
    waiting_list = [Req(idx, ids, sampling_params) for idx, ids in enumerate(prompt_ids)]

    llama_config = LlamaConfig(num_hidden_layers=2, bos_token_id=0, eos_token_id=1, vocab_size=30000, hidden_size=1024)
    model = LlamaForCausalLM(llama_config).half()

    use_tp = TP_SIZE > 1
    shard_config = ShardConfig(enable_tensor_parallelism=use_tp, extra_kwargs={"inference_only": True})

    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    batch_manager = start_dynamic_batching(arg, tp_engine=infer_engine, waiting_req_list=waiting_list)

    for result in batch_manager.generate(request_id=5, prompts="hello", sampling_params=sampling_params):
        assert result is not None
|
||||
|
||||
|
||||
def check_dynamic_forward(rank, world_size, port):
    """Worker entry point: launch colossalai over NCCL on localhost, then call ``run``."""
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not CUDA_SUPPORT,
    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_dynamic_batching():
    """Spawn ``TP_SIZE`` worker processes, each running ``check_dynamic_forward``."""
    spawn(check_dynamic_forward, nprocs=TP_SIZE)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_dynamic_batching()
|
@@ -1,66 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference.dynamic_batching.ray_dist_init import Driver
|
||||
from colossalai.inference.dynamic_batching.ray_init_config import RayInitConfig
|
||||
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
|
||||
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
|
||||
|
||||
PATH = "config.yaml"
|
||||
|
||||
|
||||
def run_ray_dist(path: str):
    """Exercise the async and sync generate paths of ``Driver`` using a YAML config.

    Returns early (``None``) when ``path`` does not exist, or when the model
    entry of the engine config is ``None`` or not an existing path.
    """
    if not os.path.exists(path):
        return
    config = RayInitConfig.from_yaml_path(path)
    router_config = config.router_config_data
    engine_config = config.engine_config_data
    model = engine_config.model
    model_available = model is not None and os.path.exists(model)
    if not model_available:
        return

    driver = Driver(router_config=router_config, engine_config=engine_config)
    prompt = "Introduce some landmarks in Beijing"

    request_id = uuid.uuid4().hex
    sampling_params = SamplingParams()
    print("sampling_params: ", sampling_params)

    async def get_result(request_id, prompt, sampling_params):
        return await driver.async_generate(request_id, prompt, sampling_params)

    # First the coroutine path, then the blocking path, with the same request id.
    for test_async in [True, False]:
        print("test_async: ", test_async)
        if test_async:
            result = asyncio.run(get_result(request_id, prompt, sampling_params))
        else:
            result = driver.generate(request_id, prompt, sampling_params)
        assert result is not None
        print("result: ", result)

    is_running = driver.is_running()
    assert is_running is not None
    print("is_running: ", is_running)
|
||||
|
||||
|
||||
def check_ray_dist(rank, world_size, port):
    """Worker entry point: launch colossalai over NCCL on localhost, then run the ray driver check on ``PATH``."""
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run_ray_dist(PATH)
|
||||
|
||||
|
||||
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_ray_dist():
    """Run ``check_ray_dist`` in a single spawned process."""
    spawn(check_ray_dist, nprocs=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_ray_dist()
|
105
tests/test_infer/test_hybrid_bloom.py
Normal file
105
tests/test_infer/test_hybrid_bloom.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import importlib.util
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import transformers
|
||||
from packaging import version
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference import CaiInferEngine
|
||||
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
|
||||
|
||||
# ``torch.version.cuda`` is None on builds without CUDA; guard it so that the
# module can still be imported (and the test skipped) instead of raising.
CUDA_SUPPORT = torch.version.cuda is not None and version.parse(torch.version.cuda) > version.parse("11.5")

# True when the optional lightllm package can be located (it is not imported here).
HAS_LIGHTLLM_KERNEL = importlib.util.find_spec("lightllm") is not None
|
||||
|
||||
|
||||
def data_gen():
    """Return one fixed 8-token sample as int64 ``input_ids`` plus an all-ones ``attention_mask``."""
    token_ids = [[15496, 11, 616, 3290, 318, 13779, 318, 13779]]
    input_ids = torch.tensor(token_ids, dtype=torch.int64)
    attention_mask = torch.ones_like(input_ids)
    return {"input_ids": input_ids, "attention_mask": attention_mask}
|
||||
|
||||
|
||||
# Module-level batch shared by the tests below: every tensor from ``data_gen``
# is moved to CUDA and tiled 16 times along its first dimension.
inputs = data_gen()
for k, v in inputs.items():
    if not (torch.is_tensor(v) or "Tensor" in v.__class__.__name__):
        continue
    new_shape = [1] * v.dim()
    new_shape[0] = 16
    inputs[k] = v.to("cuda").repeat(*new_shape)
|
||||
|
||||
|
||||
def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    """Run the shared ``inputs`` batch through ``CaiInferEngine`` with a small BLOOM model.

    Args:
        tp_size: forwarded to ``CaiInferEngine`` as ``tp_size``.
        pp_size: forwarded to ``CaiInferEngine`` as ``pp_size``.
        max_output_len: forwarded to the engine; also the expected length of
            the first generated sequence.
        micro_batch_size: forwarded to ``CaiInferEngine`` as ``micro_batch_size``.

    Raises:
        AssertionError: on rank 0, when ``len(output[0])`` differs from ``max_output_len``.
    """
    model = transformers.BloomForCausalLM(
        transformers.BloomConfig(vocab_size=20000, hidden_size=512, n_head=4, n_layer=4)
    )

    engine = CaiInferEngine(
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
    )
    output = engine.generate(inputs)
    # Only rank 0 inspects the result.
    if dist.get_rank() == 0:
        # Report the quantity that is actually compared (the message used to print len(output)).
        assert len(output[0]) == max_output_len, f"{len(output[0])}, {max_output_len}"
|
||||
|
||||
|
||||
@parameterize("tp_size", [1])
@parameterize("pp_size", [2])
@parameterize("max_output_len", [4])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    """Pipeline-only case (tp_size=1, pp_size=2); frees cached CUDA memory afterwards."""
    pipeline_inference_test(
        tp_size,
        pp_size,
        max_output_len,
        micro_batch_size,
    )
    torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@parameterize("tp_size", [2])
@parameterize("pp_size", [2])
@parameterize("max_output_len", [4])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_tp_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    """Combined case (tp_size=2, pp_size=2); frees cached CUDA memory afterwards."""
    pipeline_inference_test(
        tp_size,
        pp_size,
        max_output_len,
        micro_batch_size,
    )
    torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@parameterize("tp_size", [2])
@parameterize("pp_size", [1])
@parameterize("max_output_len", [2])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    """Tensor-parallel-only case (tp_size=2, pp_size=1); frees cached CUDA memory afterwards."""
    pipeline_inference_test(
        tp_size,
        pp_size,
        max_output_len,
        micro_batch_size,
    )
    torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def check_tp_pipeline_inference(rank, world_size, port):
    """Worker entry point: launch colossalai over NCCL on localhost, then run the combined TP + PP case."""
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run_tp_pipeline_inference_test()
|
||||
|
||||
|
||||
def check_single_inference(rank, world_size, port):
    """Worker entry point: launch colossalai over NCCL on localhost, then run the TP-only and PP-only cases in turn."""
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    for case in (run_tp_inference_test, run_pipeline_inference_test):
        case()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    # The condition covers two requirements; name both so a skip is not misattributed.
    reason="kv-cache manager engine requires cuda version to be higher than 11.5 and the lightllm package to be installed",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_pipeline_inference():
    """Run the combined TP + PP check on 4 processes, then the single-mode checks on 2 processes."""
    spawn(check_tp_pipeline_inference, nprocs=4)
    spawn(check_single_inference, nprocs=2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_pipeline_inference()
|
113
tests/test_infer/test_hybrid_chatglm2.py
Normal file
113
tests/test_infer/test_hybrid_chatglm2.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import importlib.util
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from packaging import version
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference import CaiInferEngine
|
||||
from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
|
||||
from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration
|
||||
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
|
||||
|
||||
# ``torch.version.cuda`` is None on builds without CUDA; guard it so that the
# module can still be imported (and the test skipped) instead of raising.
CUDA_SUPPORT = torch.version.cuda is not None and version.parse(torch.version.cuda) > version.parse("11.5")

# True when the optional lightllm package can be located (it is not imported here).
HAS_LIGHTLLM_KERNEL = importlib.util.find_spec("lightllm") is not None
|
||||
|
||||
|
||||
def data_gen():
    """Return one fixed 8-token sample as int64 ``input_ids`` plus an all-ones ``attention_mask``."""
    token_ids = [[15496, 11, 616, 3290, 318, 13779, 318, 13779]]
    input_ids = torch.tensor(token_ids, dtype=torch.int64)
    attention_mask = torch.ones_like(input_ids)
    return {"input_ids": input_ids, "attention_mask": attention_mask}
|
||||
|
||||
|
||||
# Module-level batch shared by the tests below: every tensor from ``data_gen``
# is moved to CUDA and tiled 16 times along its first dimension.
inputs = data_gen()
for k, v in inputs.items():
    if not (torch.is_tensor(v) or "Tensor" in v.__class__.__name__):
        continue
    new_shape = [1] * v.dim()
    new_shape[0] = 16
    inputs[k] = v.to("cuda").repeat(*new_shape)
|
||||
|
||||
|
||||
def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    """Run the shared ``inputs`` batch through ``CaiInferEngine`` with a small ChatGLM2 model.

    Args:
        tp_size: forwarded to ``CaiInferEngine`` as ``tp_size``.
        pp_size: forwarded to ``CaiInferEngine`` as ``pp_size``.
        max_output_len: forwarded to the engine; also the expected length of
            the first generated sequence.
        micro_batch_size: forwarded to ``CaiInferEngine`` as ``micro_batch_size``.

    Raises:
        AssertionError: on rank 0, when ``len(output[0])`` differs from ``max_output_len``.
    """
    chatglm_config = ChatGLMConfig(
        num_layers=2,
        vocab_size=20000,
        use_cache=True,
        multi_query_attention=True,
        multi_query_group_num=2,
        num_attention_heads=8,
        hidden_size=1024,
    )
    model = ChatGLMForConditionalGeneration(chatglm_config)

    engine = CaiInferEngine(
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
    )
    output = engine.generate(inputs)
    # Only rank 0 inspects the result.
    if dist.get_rank() == 0:
        # Report the quantity that is actually compared (the message used to print len(output)).
        assert len(output[0]) == max_output_len, f"{len(output[0])}, {max_output_len}"
|
||||
|
||||
|
||||
@parameterize("tp_size", [1])
@parameterize("pp_size", [2])
@parameterize("max_output_len", [4])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    """Pipeline-only case (tp_size=1, pp_size=2); frees cached CUDA memory afterwards."""
    pipeline_inference_test(
        tp_size,
        pp_size,
        max_output_len,
        micro_batch_size,
    )
    torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@parameterize("tp_size", [2])
@parameterize("pp_size", [2])
@parameterize("max_output_len", [4])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_tp_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    """Combined case (tp_size=2, pp_size=2); frees cached CUDA memory afterwards."""
    pipeline_inference_test(
        tp_size,
        pp_size,
        max_output_len,
        micro_batch_size,
    )
    torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@parameterize("tp_size", [2])
@parameterize("pp_size", [1])
@parameterize("max_output_len", [2])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    """Tensor-parallel-only case (tp_size=2, pp_size=1); frees cached CUDA memory afterwards."""
    pipeline_inference_test(
        tp_size,
        pp_size,
        max_output_len,
        micro_batch_size,
    )
    torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def check_tp_pipeline_inference(rank, world_size, port):
    """Worker entry point: launch colossalai over NCCL on localhost, then run the combined TP + PP case."""
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run_tp_pipeline_inference_test()
|
||||
|
||||
|
||||
def check_single_inference(rank, world_size, port):
    """Worker entry point: launch colossalai over NCCL on localhost, then run the TP-only and PP-only cases in turn."""
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    for case in (run_tp_inference_test, run_pipeline_inference_test):
        case()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    # The condition covers two requirements; name both so a skip is not misattributed.
    reason="kv-cache manager engine requires cuda version to be higher than 11.5 and the lightllm package to be installed",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_pipeline_inference():
    """Run the combined TP + PP check on 4 processes, then the single-mode checks on 2 processes."""
    spawn(check_tp_pipeline_inference, nprocs=4)
    spawn(check_single_inference, nprocs=2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_pipeline_inference()
|
@@ -1,3 +1,5 @@
|
||||
import importlib.util
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
@@ -5,11 +7,18 @@ import transformers
|
||||
from packaging import version
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy
|
||||
from colossalai.inference import CaiInferEngine
|
||||
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
|
||||
|
||||
CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")
|
||||
|
||||
import importlib.util
|
||||
|
||||
HAS_LIGHTLLM_KERNEL = True
|
||||
|
||||
if importlib.util.find_spec("lightllm") is None:
|
||||
HAS_LIGHTLLM_KERNEL = False
|
||||
|
||||
|
||||
def data_gen():
|
||||
input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
|
||||
@@ -36,11 +45,10 @@ def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
|
||||
tp_size=tp_size,
|
||||
pp_size=pp_size,
|
||||
model=model,
|
||||
model_policy=LlamaModelInferPolicy(),
|
||||
max_output_len=max_output_len,
|
||||
micro_batch_size=micro_batch_size,
|
||||
)
|
||||
output = engine.inference(inputs)
|
||||
output = engine.generate(inputs)
|
||||
if dist.get_rank() == 0:
|
||||
assert len(output[0]) == max_output_len, f"{len(output)}, {max_output_len}"
|
||||
|
||||
@@ -65,9 +73,14 @@ def run_tp_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def check_pipeline_inference(rank, world_size, port):
|
||||
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
run_pipeline_inference_test()
|
||||
@parameterize("tp_size", [2])
|
||||
@parameterize("pp_size", [1])
|
||||
@parameterize("max_output_len", [2])
|
||||
@parameterize("micro_batch_size", [1])
|
||||
@clear_cache_before_run()
|
||||
def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
|
||||
pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def check_tp_pipeline_inference(rank, world_size, port):
|
||||
@@ -75,13 +88,22 @@ def check_tp_pipeline_inference(rank, world_size, port):
|
||||
run_tp_pipeline_inference_test()
|
||||
|
||||
|
||||
@pytest.mark.skipif(not CUDA_SUPPORT, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
|
||||
def check_single_inference(rank, world_size, port):
|
||||
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
|
||||
run_tp_inference_test()
|
||||
run_pipeline_inference_test()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
|
||||
reason="kv-cache manager engine requires cuda version to be higher than 11.5",
|
||||
)
|
||||
@pytest.mark.dist
|
||||
@rerun_if_address_is_in_use()
|
||||
@clear_cache_before_run()
|
||||
def test_pipeline_inference():
|
||||
spawn(check_pipeline_inference, nprocs=2)
|
||||
spawn(check_tp_pipeline_inference, nprocs=4)
|
||||
spawn(check_single_inference, nprocs=2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
@@ -1,102 +0,0 @@
|
||||
from itertools import accumulate
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
from transformers import BloomConfig, BloomForCausalLM
|
||||
from transformers.tokenization_utils_base import BatchEncoding
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference.tensor_parallel import TPInferEngine
|
||||
from colossalai.logging import disable_existing_loggers
|
||||
from colossalai.shardformer import ShardConfig
|
||||
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
|
||||
|
||||
TP_SIZE = 2
|
||||
MAX_BATCH_SIZE = 4
|
||||
MAX_INPUT_LEN = 16
|
||||
MAX_OUTPUT_LEN = 8
|
||||
|
||||
CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")
|
||||
|
||||
|
||||
@parameterize(
    "test_config",
    [
        {
            "tp_size": TP_SIZE,
        }
    ],
)
def run(test_config):
    """Smoke-test TPInferEngine on a small Bloom model built from a config.

    Exercises three stages: engine construction, batch-state preparation from
    two input formats, and a ``generate`` call with sampling disabled.

    Args:
        test_config: dict supplied by ``parameterize``; only ``"tp_size"`` is read.
    """
    model_config = BloomConfig(num_hidden_layers=4, hidden_size=128, intermediate_size=256, num_attention_heads=4)
    model = BloomForCausalLM(model_config)
    model = model.half()
    model.to(torch.cuda.current_device())

    # 1. check TPInferEngine init and model optimization
    shard_config = ShardConfig(
        enable_tensor_parallelism=test_config["tp_size"] > 1, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

    assert infer_engine.cache_manager is not None
    assert infer_engine.tp_size == TP_SIZE
    assert infer_engine.head_num == model_config.num_attention_heads // TP_SIZE

    # 2. check data preparation
    input_ids_list = [
        [80540, 15473, 3331, 11970, 90472, 361, 61335],
        [80540, 15473, 3331, 11970],
        [80540, 15473, 3331, 11970],
        [80540, 15473],
    ]
    batch_size = len(input_ids_list)
    max_seq_len = max(len(li) for li in input_ids_list)
    # Mask is 0 on the left (padding positions) and 1 over the last len(li) slots.
    attention_mask = [[0] * (max_seq_len - len(li)) + [1] * len(li) for li in input_ids_list]
    data = dict(input_ids=input_ids_list, attention_mask=attention_mask)
    inputs_batch_encoding = BatchEncoding(data=data)

    # Expected per-sequence lengths and start offsets; only consumed by the
    # disabled assertions below, kept so they can be re-enabled unchanged.
    seq_lengths = [len(li) for li in input_ids_list]
    start_loc = list(accumulate([0] + seq_lengths[:-1]))
    seq_lengths = torch.tensor(seq_lengths, dtype=torch.int32)
    start_loc = torch.tensor(start_loc, dtype=torch.int32)

    # BatchEncoding as inputs
    batch_state_out1 = infer_engine.prepare_batch_state(inputs_batch_encoding)
    # input token id list as inputs
    batch_state_out2 = infer_engine.prepare_batch_state(input_ids_list)

    assert batch_state_out1.batch_size == batch_state_out2.batch_size == batch_size
    assert torch.equal(batch_state_out1.seq_len, batch_state_out2.seq_len)

    # The following tests are discarded for now, and will be reused after all features are added
    # assert torch.equal(batch_state_out1.seq_len.to(seq_lengths.device), seq_lengths)
    # assert torch.equal(batch_state_out2.seq_len.to(seq_lengths.device), seq_lengths)
    # assert torch.equal(batch_state_out1.start_loc.to(start_loc.device), start_loc)
    # assert torch.equal(batch_state_out2.start_loc.to(start_loc.device), start_loc)

    # 3. check optimized model generate
    input_ids = torch.randint(low=10, high=1000, size=(MAX_BATCH_SIZE, MAX_INPUT_LEN))
    generate_kwargs = dict(do_sample=False)
    infer_engine.generate(input_ids, **generate_kwargs)

    torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def check_engine(rank, world_size, port):
    """Per-process entry point: initialise the distributed context, then call ``run``.

    Args:
        rank: rank of this process, forwarded to ``colossalai.launch``.
        world_size: total number of processes, forwarded to ``colossalai.launch``.
        port: port on localhost, forwarded to ``colossalai.launch``.
    """
    disable_existing_loggers()
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not CUDA_SUPPORT,
    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_engine():
    """Run ``check_engine`` in ``TP_SIZE`` spawned processes."""
    spawn(check_engine, nprocs=TP_SIZE)
|
||||
|
||||
|
||||
# Call test_engine() directly when this file is executed as a script.
if __name__ == "__main__":
    test_engine()
|
@@ -4,7 +4,7 @@ import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
|
||||
from colossalai.inference.tensor_parallel import MemoryManager
|
||||
from colossalai.inference.kv_cache import MemoryManager
|
||||
from colossalai.logging import disable_existing_loggers
|
||||
from colossalai.testing import rerun_if_address_is_in_use, spawn
|
||||
|
||||
|
@@ -1,77 +0,0 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
from transformers import LlamaForCausalLM
|
||||
from transformers.models.llama.configuration_llama import LlamaConfig
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference.tensor_parallel.engine import TPInferEngine
|
||||
from colossalai.logging import disable_existing_loggers
|
||||
from colossalai.shardformer import ShardConfig
|
||||
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
|
||||
|
||||
# NOTE(review): nothing inside this try block can raise, so the flag is always
# True. The import it presumably guarded appears to have been stripped; restore
# the real probe before relying on this flag. The handler is narrowed from a
# bare ``except:`` so that it cannot swallow unrelated errors once it is restored.
try:
    HAS_LIGHTLLM_KERNEL = True
except ImportError:
    HAS_LIGHTLLM_KERNEL = False

# Presumably silences transformers advisory warnings during the test; confirm.
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
# Tensor-parallel degree; also the number of processes spawned by test_llama().
TPSIZE = 2
# Limits passed positionally to TPInferEngine(...) and used to size the inputs.
BATCH_SIZE = 8
MAX_INPUT_LEN = 12
MAX_OUTPUT_LEN = 100

# torch.version.cuda is None on CPU-only PyTorch builds and version.parse(None)
# raises TypeError, which would break test collection. Treat "no CUDA" as
# unsupported so the test is skipped instead.
CUDA_SUPPORT = torch.version.cuda is not None and version.parse(torch.version.cuda) > version.parse("11.5")
|
||||
|
||||
|
||||
@parameterize("test_config", [{"tp_size": TPSIZE}])
def run_llama_test(test_config):
    """Build a two-layer Llama model, wrap it in TPInferEngine and generate once.

    Args:
        test_config: dict supplied by ``parameterize``; only ``"tp_size"`` is read.
    """
    llama_config = LlamaConfig(
        num_hidden_layers=2,
        num_key_value_heads=8,
        bos_token_id=0,
        eos_token_id=1,
        vocab_size=1200,
        hidden_size=1024,
    )
    model = LlamaForCausalLM(llama_config).half()

    # Tensor parallelism is only enabled when more than one shard is requested.
    use_tp = test_config["tp_size"] > 1
    shard_config = ShardConfig(enable_tensor_parallelism=use_tp, extra_kwargs={"inference_only": True})
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

    batch_shape = (BATCH_SIZE, MAX_INPUT_LEN)
    input_tokens = {
        "input_ids": torch.randint(1, 1000, batch_shape, device="cuda"),
        "attention_mask": torch.ones(batch_shape, device="cuda"),
    }
    outputs = infer_engine.generate(input_tokens, max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)

    assert outputs is not None
|
||||
|
||||
|
||||
def check_llama(rank, world_size, port):
    """Per-process entry point: initialise the distributed context, then call ``run_llama_test``.

    Args:
        rank: rank of this process, forwarded to ``colossalai.launch``.
        world_size: total number of processes, forwarded to ``colossalai.launch``.
        port: port on localhost, forwarded to ``colossalai.launch``.
    """
    disable_existing_loggers()
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run_llama_test()
|
||||
|
||||
|
||||
# Two separate skip conditions so that the reported skip reason matches the
# requirement that is actually missing.
@pytest.mark.skipif(not CUDA_SUPPORT, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
@pytest.mark.skipif(not HAS_LIGHTLLM_KERNEL, reason="lightllm kernels are not available")
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama():
    """Run ``check_llama`` in ``TPSIZE`` spawned processes."""
    spawn(check_llama, TPSIZE)
|
||||
|
||||
|
||||
# Call test_llama() directly when this file is executed as a script.
if __name__ == "__main__":
    test_llama()
|
@@ -1,75 +0,0 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
from transformers import LlamaForCausalLM
|
||||
from transformers.models.llama.configuration_llama import LlamaConfig
|
||||
|
||||
import colossalai
|
||||
from colossalai.inference.tensor_parallel.engine import TPInferEngine
|
||||
from colossalai.logging import disable_existing_loggers
|
||||
from colossalai.shardformer import ShardConfig
|
||||
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
|
||||
|
||||
# NOTE(review): nothing inside this try block can raise, so the flag is always
# True. The import it presumably guarded appears to have been stripped; restore
# the real probe before relying on this flag. The handler is narrowed from a
# bare ``except:`` so that it cannot swallow unrelated errors once it is restored.
try:
    HAS_LIGHTLLM_KERNEL = True
except ImportError:
    HAS_LIGHTLLM_KERNEL = False

# Presumably silences transformers advisory warnings during the test; confirm.
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
# Tensor-parallel degree; also the number of processes spawned by test_llama().
TPSIZE = 2
# Limits passed positionally to TPInferEngine(...) and used to size the inputs.
BATCH_SIZE = 8
MAX_INPUT_LEN = 12
MAX_OUTPUT_LEN = 100

# torch.version.cuda is None on CPU-only PyTorch builds and version.parse(None)
# raises TypeError, which would break test collection. Treat "no CUDA" as
# unsupported so the test is skipped instead.
CUDA_SUPPORT = torch.version.cuda is not None and version.parse(torch.version.cuda) > version.parse("11.5")
|
||||
|
||||
|
||||
@parameterize("test_config", [{"tp_size": TPSIZE}])
def run_llama_test(test_config):
    """Build a two-layer Llama model, wrap it in TPInferEngine and generate once.

    Args:
        test_config: dict supplied by ``parameterize``; only ``"tp_size"`` is read.
    """
    llama_config = LlamaConfig(
        num_hidden_layers=2,
        bos_token_id=0,
        eos_token_id=1,
        vocab_size=1200,
        hidden_size=1024,
    )
    model = LlamaForCausalLM(llama_config).half()

    # Tensor parallelism is only enabled when more than one shard is requested.
    use_tp = test_config["tp_size"] > 1
    shard_config = ShardConfig(enable_tensor_parallelism=use_tp, extra_kwargs={"inference_only": True})
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

    batch_shape = (BATCH_SIZE, MAX_INPUT_LEN)
    input_tokens = {
        "input_ids": torch.randint(1, 1000, batch_shape, device="cuda"),
        "attention_mask": torch.ones(batch_shape, device="cuda"),
    }
    outputs = infer_engine.generate(input_tokens, max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)

    assert outputs is not None
|
||||
|
||||
|
||||
def check_llama(rank, world_size, port):
    """Per-process entry point: initialise the distributed context, then call ``run_llama_test``.

    Args:
        rank: rank of this process, forwarded to ``colossalai.launch``.
        world_size: total number of processes, forwarded to ``colossalai.launch``.
        port: port on localhost, forwarded to ``colossalai.launch``.
    """
    disable_existing_loggers()
    launch_kwargs = dict(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    colossalai.launch(**launch_kwargs)
    run_llama_test()
|
||||
|
||||
|
||||
# Two separate skip conditions so that the reported skip reason matches the
# requirement that is actually missing.
@pytest.mark.skipif(not CUDA_SUPPORT, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
@pytest.mark.skipif(not HAS_LIGHTLLM_KERNEL, reason="lightllm kernels are not available")
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama():
    """Run ``check_llama`` in ``TPSIZE`` spawned processes."""
    spawn(check_llama, TPSIZE)
|
||||
|
||||
|
||||
# Call test_llama() directly when this file is executed as a script.
if __name__ == "__main__":
    test_llama()
|
Reference in New Issue
Block a user