Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-04-29 20:25:29 +00:00

Compare commits (3 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 46ed5d856b | |
| | 7ecdf9a211 | |
| | 44d4053fec | |

.github/workflows/build_on_pr.yml (vendored): 6 changes

@@ -87,10 +87,10 @@ jobs:
     name: Build and Test Colossal-AI
     needs: detect
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
-    runs-on: [self-hosted, gpu]
+    runs-on: ubuntu-latest
     container:
-      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
-      options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
+      image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
+      options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch
     timeout-minutes: 90
     defaults:
       run:

@@ -38,7 +38,7 @@ Limited Academic Bonuses:

 <div align="center">
   <a href="https://hpc-ai.com/?utm_source=github&utm_medium=social&utm_campaign=promotion-colossalai">
-    <img src="https://github.com/hpcaitech/public_assets/blob/main/colossalai/img/2.gif" width="850" />
+    <img src="https://github.com/hpcaitech/public_assets/blob/main/colossalai/img/2-2.gif" width="850" />
   </a>
 </div>

@@ -892,6 +892,63 @@ The dialogues can by multiple turns and it can contain system prompt. For more d

We use bf16 weights for finetuning. If you downloaded fp8 DeepSeek V3/R1 weights, you can use this [script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) to convert the weights to bf16 on a GPU. For Ascend NPU, you can use this [script](https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/LLM/DeepSeek/DeepSeek-V2/NPU_inference/fp8_cast_bf16.py).
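
For reference, here is a minimal sketch of driving that conversion from Python. The `--input-fp8-hf-path` and `--output-bf16-hf-path` flag names and the local paths are assumptions taken from the DeepSeek-V3 repository's usage notes, so verify them against your copy of `fp8_cast_bf16.py`.

```python
import subprocess

fp8_path = "/path/to/DeepSeek-V3-fp8"    # directory holding the downloaded fp8 checkpoint
bf16_path = "/path/to/DeepSeek-V3-bf16"  # output directory for the converted bf16 weights

# Invoke DeepSeek's conversion script on a GPU machine; the flag names are
# assumptions based on the script's documentation and may need adjusting.
subprocess.run(
    [
        "python", "fp8_cast_bf16.py",
        "--input-fp8-hf-path", fp8_path,
        "--output-bf16-hf-path", bf16_path,
    ],
    check=True,  # raise if the conversion fails
)
```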
We have also added details on how to load and run inference with LoRA models.

```python
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
from peft import PeftModel
import torch

# Set model paths
model_name = "Qwen/Qwen2.5-3B"
lora_adapter = "Qwen2.5-3B_lora"  # path to your LoRA adapter
merged_model_path = "Qwen2.5-3B_merged"

######
# How to load a LoRA model
######
# 1. Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# 2. Load the LoRA adapter on top of the base model
peft_model = PeftModel.from_pretrained(
    base_model,
    lora_adapter,
    torch_dtype=torch.bfloat16,
)

# 3. Merge the LoRA weights into the base model
merged_model = peft_model.merge_and_unload()

# 4. Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    pad_token="<|endoftext|>",
)

# 5. Save the merged model and tokenizer
merged_model.save_pretrained(
    merged_model_path,
    safe_serialization=True,
)
tokenizer.save_pretrained(merged_model_path)

# 6. Run inference
test_input = tokenizer("Instruction: Finding prime numbers up to 100\nAnswer:", return_tensors="pt").to("cuda")
output = merged_model.generate(**test_input, max_new_tokens=100)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
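
Once the merged checkpoint has been saved, it can be reloaded like any ordinary Hugging Face model, with no PEFT wrapper required. A minimal sketch, reusing the `Qwen2.5-3B_merged` path from the example above:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload the merged checkpoint produced above; PEFT is no longer needed.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen2.5-3B_merged",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("Qwen2.5-3B_merged", trust_remote_code=True)

prompt = "Instruction: Finding prime numbers up to 100\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```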

#### Usage

After preparing the dataset and model weights, you can run the script with the following command:

@@ -6,9 +6,10 @@ from torch.testing import assert_close
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import all_to_all_single_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


+@clear_cache_before_run()
 @parameterize("shape", [(4,), (1, 8, 16), (4, 8, 16)])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("async_op", [True, False])
@@ -24,6 +25,7 @@ def check_all2all(shape, dtype, async_op):
     assert_close(output, output_fp8, rtol=0.1, atol=0.1)


+@clear_cache_before_run()
 @parameterize("shape", [(8, 8, 16)])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("async_op", [True, False])

@@ -6,9 +6,10 @@ from torch.testing import assert_close
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import _all_to_all_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


+@clear_cache_before_run()
 @parameterize("shape", [(16, 8, 4)])
 @parameterize("scatter_dim", [0, 1, 2])
 @parameterize("dtype", [torch.bfloat16, torch.float16])

@@ -6,11 +6,12 @@ from torch.testing import assert_close
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import all_to_all_single_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

 dist.all_to_all_single


+@clear_cache_before_run()
 @parameterize("shape", [(4), (8, 7), (4, 8, 16)])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])

@@ -6,9 +6,10 @@ from torch.testing import assert_close
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import _all_gather_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


+@clear_cache_before_run()
 @parameterize(
     "shape",
     [(3, 7, 16)],

@@ -5,7 +5,7 @@ from torch.testing import assert_close
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import all_reduce_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


 @parameterize(
@@ -20,6 +20,7 @@ from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
         (8,),
     ],
 )
+@clear_cache_before_run()
 @parameterize("dtype", [torch.float16, torch.bfloat16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
 @parameterize("async_op", [True, False])

@@ -3,9 +3,10 @@ from torch.testing import assert_close

 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import cast_from_fp8, cast_from_fp8_pipeline, cast_to_fp8, cast_to_fp8_pipeline
-from colossalai.testing import parameterize
+from colossalai.testing import clear_cache_before_run, parameterize


+@clear_cache_before_run()
 @parameterize("shape", [(100, 10), (10, 100), (3, 7), (2, 1), (1, 2), (2, 2), (4, 2), (5,), (4,), (2,)])
 @parameterize("dtype", [torch.bfloat16, torch.float16, torch.float32])
 @parameterize("fp8_format", ["e4m3", "e5m2"])

@@ -8,7 +8,7 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.testing import assert_close

 from colossalai import launch
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

 # example modified from https://pytorch.org/tutorials/intermediate/ddp_tutorial.html

@@ -28,6 +28,7 @@ class ToyModel(nn.Module):
         return self.net2(self.relu(self.net1(x)))


+@clear_cache_before_run()
 @parameterize("mode", ["grad", "params"])
 def run_model(mode):
     rank = dist.get_rank()

@@ -6,9 +6,10 @@ from torch.testing import assert_close
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
 from colossalai.quantization.fp8 import reduce_scatter_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn


+@clear_cache_before_run()
 @parameterize("shape", [(16, 8, 4)])
 @parameterize("scatter_dim", [0, 1, 2])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
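
The test diffs above all apply the same change: `clear_cache_before_run()` is added in front of the parameterized FP8 tests so that cached accelerator memory is freed before the decorated test runs. Below is a minimal, self-contained sketch of that pattern; the `cast_to_fp8`/`cast_from_fp8` signatures are inferred from the imports shown above and should be verified against `colossalai.quantization.fp8`.

```python
import torch
from torch.testing import assert_close

from colossalai.accelerator import get_accelerator
from colossalai.quantization.fp8 import cast_from_fp8, cast_to_fp8
from colossalai.testing import clear_cache_before_run, parameterize


@clear_cache_before_run()
@parameterize("shape", [(4, 8, 16)])
@parameterize("dtype", [torch.bfloat16, torch.float16])
@parameterize("fp8_format", ["e4m3", "e5m2"])
def run_fp8_cast_roundtrip(shape, dtype, fp8_format):
    # Round-trip a random tensor through the FP8 cast and check it stays close
    # to the original, using the loose tolerances seen in the existing tests.
    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
    x_fp8, scale_inv = cast_to_fp8(x, fp8_format=fp8_format)
    x_back = cast_from_fp8(x_fp8, scale_inv, x.dtype)
    assert_close(x, x_back, rtol=0.1, atol=0.1)


if __name__ == "__main__":
    # parameterize expands the decorated function over all argument combinations.
    run_fp8_cast_roundtrip()
```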