[Inference]Adapt temperature processing logic (#5689)

* Adapt temperature processing logic

* add ValueError for top_p and top_k

* add GQA Test

* fix except_msg
This commit is contained in:
yuehuayingxueluo
2024-05-08 17:58:29 +08:00
committed by GitHub
parent 12e7c28d5e
commit 9c2fe7935f
3 changed files with 36 additions and 6 deletions

View File

@@ -28,7 +28,12 @@ def check_inference_engine(use_engine=False, prompt_template=None, do_sample=Tru
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
model = LlamaForCausalLM(
LlamaConfig(
vocab_size=50000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=16
vocab_size=50000,
hidden_size=512,
intermediate_size=1536,
num_attention_heads=4,
num_key_value_heads=2,
num_hidden_layers=16,
)
).cuda()
model = model.eval()