From 44d4053fec005fe0b06b6bc755fdc962463145df Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 7 Mar 2025 14:14:26 +0800
Subject: [PATCH] [HotFix] update load lora model Readme; (#6240)

* [fix] update load lora model Readme;

* [fix] update lora infer readme

* [fix] remove useless comments
---
 applications/ColossalChat/examples/README.md | 57 ++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index af10dea25..f111660a8 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -892,6 +892,63 @@ The dialogues can by multiple turns and it can contain system prompt. For more d
 
 We use bf16 weights for finetuning. If you downloaded fp8 DeepSeek V3/R1 weights, you can use the [script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) to convert the weights to bf16 via GPU. For Ascend NPU, you can use this [script](https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/LLM/DeepSeek/DeepSeek-V2/NPU_inference/fp8_cast_bf16.py).
 
+We have also added details on how to load a LoRA model and run inference with it.
+```python
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+)
+from peft import (
+    PeftModel,
+)
+import torch
+
+# Set model paths
+model_name = "Qwen/Qwen2.5-3B"
+lora_adapter = "Qwen2.5-3B_lora"  # path to your LoRA adapter
+merged_model_path = "Qwen2.5-3B_merged"
+
+######
+# How to load a LoRA model
+######
+# 1. Load the base model
+base_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    trust_remote_code=True,
+)
+
+# 2. Load the LoRA adapter on top of the base model
+peft_model = PeftModel.from_pretrained(
+    base_model,
+    lora_adapter,
+    torch_dtype=torch.bfloat16,
+)
+
+# 3. Merge the LoRA weights into the base model and drop the adapter
+merged_model = peft_model.merge_and_unload()
+
+# 4. Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(
+    model_name,
+    trust_remote_code=True,
+    pad_token="<|endoftext|>",
+)
+
+# 5. Save the merged model and tokenizer
+merged_model.save_pretrained(
+    merged_model_path,
+    safe_serialization=True,
+)
+tokenizer.save_pretrained(merged_model_path)
+
+# 6. Run inference (use the model's own device rather than hard-coding "cuda",
+# since device_map="auto" chooses the placement)
+test_input = tokenizer("Instruction: Finding prime numbers up to 100\nAnswer:", return_tensors="pt").to(merged_model.device)
+output = merged_model.generate(**test_input, max_new_tokens=100)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
 #### Usage
 
 After preparing the dataset and model weights, you can run the script with the following command:
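
Two optional follow-ups to the example added by this patch. First, merging is not required just to try an adapter: `peft` can run generation with the LoRA weights still attached, which is convenient while iterating on checkpoints. A minimal sketch, reusing the placeholder paths from the patch above (`Qwen/Qwen2.5-3B` and `Qwen2.5-3B_lora` are illustrative, not shipped artifacts):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Placeholder paths matching the example above; substitute your own checkpoints.
model_name = "Qwen/Qwen2.5-3B"
lora_adapter = "Qwen2.5-3B_lora"

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Attach the adapter without merging: LoRA layers are injected into the base
# model's modules, so generate() below already reflects the fine-tuned weights.
peft_model = PeftModel.from_pretrained(base_model, lora_adapter)

inputs = tokenizer(
    "Instruction: Finding prime numbers up to 100\nAnswer:", return_tensors="pt"
).to(base_model.device)
output = peft_model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

Skipping the merge keeps the base weights untouched and lets you swap adapters freely, at the cost of a small per-forward overhead from the adapter layers.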
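
Second, once `save_pretrained` has written the merged checkpoint, later runs need only `transformers`; the directory behaves like any standalone Hugging Face model. A sketch assuming the `Qwen2.5-3B_merged` directory produced by the example above:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Directory written by save_pretrained() in the example above (illustrative).
merged_model_path = "Qwen2.5-3B_merged"

model = AutoModelForCausalLM.from_pretrained(
    merged_model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(merged_model_path, trust_remote_code=True)

inputs = tokenizer(
    "Instruction: Finding prime numbers up to 100\nAnswer:", return_tensors="pt"
).to(model.device)
output = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```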