[example] add llama2 example (#4527)

* [example] transfer llama-1 example

* [example] fit llama-2

* [example] refactor scripts folder

* [example] fit new gemini plugin

* [cli] fix multinode runner

* [example] fit gemini optim checkpoint

* [example] refactor scripts

* [example] update requirements

* [example] update requirements

* [example] rename llama to llama2

* [example] update readme and pretrain script

* [example] refactor scripts
This commit is contained in:
Hongxin Liu
2023-08-28 17:59:11 +08:00
committed by GitHub
parent 839847b7d7
commit 0b00def881
17 changed files with 1087 additions and 19 deletions

View File

@@ -265,6 +265,10 @@ def launch_multi_processes(args: Config) -> None:
# establish remote connection
runner.connect(host_info_list=active_device_pool, workdir=curr_path, env=env)
# overwrite master addr when num_nodes > 1 and not specified
if len(active_device_pool) > 1 and args.master_addr == "127.0.0.1":
args.master_addr = active_device_pool.hostinfo_list[0].hostname
# execute distributed launching command
for node_id, hostinfo in enumerate(active_device_pool):
cmd = get_launch_command(master_addr=args.master_addr,