[Inference] Adapt temperature processing logic (#5689)

* Adapt temperature processing logic
* add ValueError for top_p and top_k
* add GQA Test
* fix except_msg
2025-09-05 11:02:05 +00:00 · 2024-05-08 17:58:29 +08:00
parent 12e7c28d5e
commit 9c2fe7935f
3 changed files with 36 additions and 6 deletions
--- a/tests/test_infer/test_inference_engine.py
+++ b/tests/test_infer/test_inference_engine.py
@@ -28,7 +28,12 @@ def check_inference_engine(use_engine=False, prompt_template=None, do_sample=Tru
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    model = LlamaForCausalLM(
        LlamaConfig(
-            vocab_size=50000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=16
+            vocab_size=50000,
+            hidden_size=512,
+            intermediate_size=1536,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            num_hidden_layers=16,
        )
    ).cuda()
    model = model.eval()