diff --git a/applications/Colossal-LLaMA-2/inference_example.py b/applications/Colossal-LLaMA-2/inference_example.py
index 7fe2d92ab..f6c2e2208 100644
--- a/applications/Colossal-LLaMA-2/inference_example.py
+++ b/applications/Colossal-LLaMA-2/inference_example.py
@@ -1,17 +1,16 @@
 import argparse
-import os
 
 import torch
+from colossal_llama2.dataset.conversation import default_conversation
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
 from colossalai.logging import get_dist_logger
-from transformers import AutoTokenizer, AutoModelForCausalLM
 
 logger = get_dist_logger()
 
 
 def load_model(model_path, device="cuda", **kwargs):
-    logger.info(
-        "Please check whether the tokenizer and model weights are properly stored in the same folder."
-    )
+    logger.info("Please check whether the tokenizer and model weights are properly stored in the same folder.")
     model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs)
     model.to(device)
 
@@ -27,31 +26,50 @@ def load_model(model_path, device="cuda", **kwargs):
 def generate(args):
     model, tokenizer = load_model(model_path=args.model_path, device=args.device)
 
-    BASE_INFERENCE_SUFFIX = "\n\n->\n\n"
-    input_txt = f"{args.input_txt}{BASE_INFERENCE_SUFFIX}"
+    if args.prompt_style == "sft":
+        conversation = default_conversation.copy()
+        conversation.append_message("Human", args.input_txt)
+        input_txt = conversation.get_prompt()
+    else:
+        BASE_INFERENCE_SUFFIX = "\n\n->\n\n"
+        input_txt = f"{args.input_txt}{BASE_INFERENCE_SUFFIX}"
 
-    inputs = tokenizer(args.input_txt, return_tensors='pt').to(args.device)
-    output = model.generate(**inputs,
-                            max_new_tokens=args.max_new_tokens,
-                            do_sample=args.do_sample,
-                            temperature=args.temperature,
-                            top_k=args.top_k,
-                            top_p=args.top_p,
-                            num_return_sequences=1)
-    response = tokenizer.decode(output.cpu()[0], skip_special_tokens=True)[len(input_txt):]
+    inputs = tokenizer(input_txt, return_tensors="pt").to(args.device)
+    num_input_tokens = inputs["input_ids"].shape[-1]
+    output = model.generate(
+        **inputs,
+        max_new_tokens=args.max_new_tokens,
+        do_sample=args.do_sample,
+        temperature=args.temperature,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        num_return_sequences=1,
+    )
+    response = tokenizer.decode(output.cpu()[0, num_input_tokens:], skip_special_tokens=True)
     logger.info(f"Question: {input_txt} \n\n Answer: \n{response}")
     return response
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Colossal-LLaMA-2 inference Process.")
-    parser.add_argument('--model_path', type=str, default="hpcai-tech/Colossal-LLaMA-2-7b-base", help="HF repo name or local path of the model")
-    parser.add_argument('--device', type=str, default="cuda:0", help="Set the device")
-    parser.add_argument('--max_new_tokens', type=int, default=512, help=" Set maximum numbers of tokens to generate, ignoring the number of tokens in the prompt")
-    parser.add_argument('--do_sample', type=bool, default=True, help="Set whether or not to use sampling")
-    parser.add_argument('--temperature', type=float, default=0.3, help="Set temperature value")
-    parser.add_argument('--top_k', type=int, default=50, help="Set top_k value for top-k-filtering")
-    parser.add_argument('--top_p', type=int, default=0.95, help="Set top_p value for generation")
-    parser.add_argument('--input_txt', type=str, default="明月松间照,", help="The prompt input to the model")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="hpcai-tech/Colossal-LLaMA-2-7b-base",
+        help="HF repo name or local path of the model",
+    )
+    parser.add_argument("--device", type=str, default="cuda:0", help="Set the device")
+    parser.add_argument(
+        "--max_new_tokens",
+        type=int,
+        default=512,
+        help=" Set maximum numbers of tokens to generate, ignoring the number of tokens in the prompt",
+    )
+    parser.add_argument("--do_sample", type=bool, default=True, help="Set whether or not to use sampling")
+    parser.add_argument("--temperature", type=float, default=0.3, help="Set temperature value")
+    parser.add_argument("--top_k", type=int, default=50, help="Set top_k value for top-k-filtering")
+    parser.add_argument("--top_p", type=int, default=0.95, help="Set top_p value for generation")
+    parser.add_argument("--input_txt", type=str, default="明月松间照,", help="The prompt input to the model")
+    parser.add_argument("--prompt_style", choices=["sft", "pretrained"], default="sft", help="The style of the prompt")
     args = parser.parse_args()
-    generate(args)
\ No newline at end of file
+    generate(args)
diff --git a/applications/Colossal-LLaMA-2/train.py b/applications/Colossal-LLaMA-2/train.py
index 20ec2a7c8..2e4bab75a 100644
--- a/applications/Colossal-LLaMA-2/train.py
+++ b/applications/Colossal-LLaMA-2/train.py
@@ -154,6 +154,7 @@ def main() -> None:
             precision=args.mixed_precision,
             initial_scale=2**16,
             max_norm=args.grad_clip,
+            enable_gradient_accumulation=(args.accumulation_steps > 1),
         )
     elif args.plugin == "gemini_auto":
         plugin = GeminiPlugin(
@@ -161,6 +162,7 @@ def main() -> None:
             placement_policy="auto",
             initial_scale=2**16,
             max_norm=args.grad_clip,
+            enable_gradient_accumulation=(args.accumulation_steps > 1),
         )
     elif args.plugin == "zero2":
         plugin = LowLevelZeroPlugin(
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 79831cf33..bc6c9d088 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -726,11 +726,13 @@ class GeminiDDP(ModelWrapper):
                 chunk.cpu_shard.copy_(temp_chunk[chunk.shard_begin : chunk.shard_end])
             del temp_chunk
 
-        if self.reuse_fp16_chunk:
-            for chunk_32 in chunk_list:
-                chunk_16 = chunk_32.paired_chunk
-                assert chunk_16 is not None
-                chunk_16.payload.copy_(chunk_32.payload)
+
+        # sync running weights and master weights
+        if self.master_weights:
+            for loaded_chunk in chunk_list:
+                paired_chunk = loaded_chunk.paired_chunk
+                assert paired_chunk is not None
+                paired_chunk.payload.copy_(loaded_chunk.payload)
 
         for name, buf in persistent_buffers.items():
             if buf is not None:
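
A minimal usage sketch (not part of the patch) for the updated inference entry point: it builds an argparse.Namespace mirroring the flags defined above and calls generate() with the new --prompt_style option. It assumes it is run from applications/Colossal-LLaMA-2 so that inference_example.py and colossal_llama2 are importable; the values shown are illustrative, not recommendations.

# Hypothetical driver script; run from applications/Colossal-LLaMA-2 with the
# dependencies above installed and a CUDA device available.
import argparse

from inference_example import generate

if __name__ == "__main__":
    args = argparse.Namespace(
        model_path="hpcai-tech/Colossal-LLaMA-2-7b-base",
        device="cuda:0",
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        top_k=50,
        top_p=0.95,
        input_txt="明月松间照,",
        # "sft" wraps the prompt in the conversation template;
        # "pretrained" falls back to appending the "\n\n->\n\n" suffix.
        prompt_style="sft",
    )
    print(generate(args))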
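
And a small sketch of how the new GeminiPlugin keyword is wired up, mirroring the train.py change: the flag is only enabled when gradient accumulation is actually in use. Apart from enable_gradient_accumulation itself, the hyperparameter values and the accumulation_steps variable here are assumptions for illustration, not taken from the patch.

# Illustrative only: values other than the new keyword are assumed.
from colossalai.booster.plugin import GeminiPlugin

accumulation_steps = 4  # stands in for args.accumulation_steps in train.py
plugin = GeminiPlugin(
    precision="bf16",
    initial_scale=2**16,
    max_norm=1.0,
    # Tell Gemini up front that gradients will be accumulated over several
    # micro-batches, matching the guard added in train.py.
    enable_gradient_accumulation=(accumulation_steps > 1),
)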