mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-06-24 22:42:15 +00:00
[hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch
* fix: use return_outputs=False to eliminate extra memory consumption
* feat: add return_outputs warning
* style: remove `return_outputs=False` as it is the default value
This commit is contained in:
parent
5fcd7795cd
commit
bb0a668fee
@ -238,7 +238,6 @@ def main():
|
|||||||
lambda x, y: x.loss,
|
lambda x, y: x.loss,
|
||||||
optimizer,
|
optimizer,
|
||||||
return_loss=True,
|
return_loss=True,
|
||||||
return_outputs=True,
|
|
||||||
)
|
)
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
|
@ -1183,6 +1183,9 @@ class HybridParallelPlugin(PipelinePluginBase):
|
|||||||
) -> dict:
|
) -> dict:
|
||||||
assert self.enable_pipeline_parallelism, "pipeline parallelism is not enabled"
|
assert self.enable_pipeline_parallelism, "pipeline parallelism is not enabled"
|
||||||
|
|
||||||
|
if return_outputs:
|
||||||
|
warnings.warn("return_outputs may lead to significant extra memory consumption.")
|
||||||
|
|
||||||
# Create a context for gradient synchronization based on the optimizer type.
|
# Create a context for gradient synchronization based on the optimizer type.
|
||||||
# If it's a HybridParallelZeroOptimizer, use optimizer.no_sync(); otherwise, use model.no_sync().
|
# If it's a HybridParallelZeroOptimizer, use optimizer.no_sync(); otherwise, use model.no_sync().
|
||||||
# This is to avoid redundant gradient reduction in pipeline parallelism (multiple microbatch values should be reduced once),
|
# This is to avoid redundant gradient reduction in pipeline parallelism (multiple microbatch values should be reduced once),
|
||||||
|
@ -7,7 +7,7 @@ from torch.nn import Module
|
|||||||
from torch.utils._pytree import tree_map
|
from torch.utils._pytree import tree_map
|
||||||
|
|
||||||
from colossalai.accelerator import get_accelerator
|
from colossalai.accelerator import get_accelerator
|
||||||
from colossalai.interface import ModelWrapper, OptimizerWrapper
|
from colossalai.interface import OptimizerWrapper
|
||||||
from colossalai.pipeline.p2p import PipelineP2PCommunication, create_send_metadata
|
from colossalai.pipeline.p2p import PipelineP2PCommunication, create_send_metadata
|
||||||
from colossalai.pipeline.stage_manager import PipelineStageManager
|
from colossalai.pipeline.stage_manager import PipelineStageManager
|
||||||
from colossalai.utils import get_current_device
|
from colossalai.utils import get_current_device
|
||||||
@ -327,9 +327,7 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
|
|||||||
self.send_forward(output_obj)
|
self.send_forward(output_obj)
|
||||||
|
|
||||||
if outputs is not None:
|
if outputs is not None:
|
||||||
if isinstance(model, ModelWrapper):
|
outputs = merge_batch(outputs)
|
||||||
model = model.unwrap()
|
|
||||||
outputs = merge_batch(outputs, getattr(model, "batch_size_dim", 0))
|
|
||||||
return {"loss": accum_loss, "outputs": outputs}
|
return {"loss": accum_loss, "outputs": outputs}
|
||||||
|
|
||||||
def run_forward_backward(
|
def run_forward_backward(
|
||||||
@ -412,9 +410,7 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
|
|||||||
assert all(len(v) == 0 for v in input_objs) and all(len(v) == 0 for v in output_objs)
|
assert all(len(v) == 0 for v in input_objs) and all(len(v) == 0 for v in output_objs)
|
||||||
|
|
||||||
if outputs is not None:
|
if outputs is not None:
|
||||||
if isinstance(model, ModelWrapper):
|
outputs = merge_batch(outputs)
|
||||||
model = model.unwrap()
|
|
||||||
outputs = merge_batch(outputs, getattr(model, "batch_size_dim", 0))
|
|
||||||
return {"loss": accum_loss, "outputs": outputs}
|
return {"loss": accum_loss, "outputs": outputs}
|
||||||
|
|
||||||
def forward_backward_step(
|
def forward_backward_step(
|
||||||
|
@ -178,7 +178,7 @@ def train_epoch(
|
|||||||
for _ in pbar:
|
for _ in pbar:
|
||||||
if use_pipeline:
|
if use_pipeline:
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
|
train_dataloader_iter, model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
|
@ -231,7 +231,7 @@ def run_forward_backward(
|
|||||||
if isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1:
|
if isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1:
|
||||||
# run pipeline forward backward when enabling pp in hybrid parallel plugin
|
# run pipeline forward backward when enabling pp in hybrid parallel plugin
|
||||||
output_dict = booster.execute_pipeline(
|
output_dict = booster.execute_pipeline(
|
||||||
data_iter, model, criterion, optimizer, return_loss=True, return_outputs=True
|
data_iter, model, criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
loss, outputs = output_dict["loss"], output_dict["outputs"]
|
loss, outputs = output_dict["loss"], output_dict["outputs"]
|
||||||
else:
|
else:
|
||||||
|
@ -198,8 +198,7 @@ def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, _criterion:
|
|||||||
model,
|
model,
|
||||||
_criterion,
|
_criterion,
|
||||||
optimizer,
|
optimizer,
|
||||||
return_loss=True,
|
return_loss=True)
|
||||||
return_outputs=True)
|
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
loss = outputs['loss']
|
loss = outputs['loss']
|
||||||
|
@ -271,7 +271,7 @@ However, if pipeline parallel is enabled, there are several usages different fro
|
|||||||
3. Do forward and backward passing through calling `Booster.execute_pipeline` method:
|
3. Do forward and backward passing through calling `Booster.execute_pipeline` method:
|
||||||
```python
|
```python
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
|
train_dataloader_iter, model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
Backward passing has been completed by this method, so there is no need to call `loss.backward()` after executing this method.
|
Backward passing has been completed by this method, so there is no need to call `loss.backward()` after executing this method.
|
||||||
|
@ -175,7 +175,7 @@ def train_epoch(
|
|||||||
for _ in pbar:
|
for _ in pbar:
|
||||||
if use_pipeline:
|
if use_pipeline:
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
|
train_dataloader_iter, model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
|
@ -234,7 +234,7 @@ def run_forward_backward(
|
|||||||
if isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1:
|
if isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1:
|
||||||
# run pipeline forward backward when enabling pp in hybrid parallel plugin
|
# run pipeline forward backward when enabling pp in hybrid parallel plugin
|
||||||
output_dict = booster.execute_pipeline(
|
output_dict = booster.execute_pipeline(
|
||||||
data_iter, model, criterion, optimizer, return_loss=True, return_outputs=True
|
data_iter, model, criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
loss, outputs = output_dict["loss"], output_dict["outputs"]
|
loss, outputs = output_dict["loss"], output_dict["outputs"]
|
||||||
else:
|
else:
|
||||||
|
@ -193,8 +193,7 @@ def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, _criterion:
|
|||||||
model,
|
model,
|
||||||
_criterion,
|
_criterion,
|
||||||
optimizer,
|
optimizer,
|
||||||
return_loss=True,
|
return_loss=True)
|
||||||
return_outputs=True)
|
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
loss = outputs['loss']
|
loss = outputs['loss']
|
||||||
|
@ -264,7 +264,7 @@ elif args.plugin == "hybrid_parallel":
|
|||||||
3. 通过调用`Booster.execute_pipeline` 方法来执行前向和后向传递:
|
3. 通过调用`Booster.execute_pipeline` 方法来执行前向和后向传递:
|
||||||
```python
|
```python
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
|
train_dataloader_iter, model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
该方法会自动执行后向传递,所以在执行该方法后不需要再调用 `loss.backward()`方法。
|
该方法会自动执行后向传递,所以在执行该方法后不需要再调用 `loss.backward()`方法。
|
||||||
|
@ -120,7 +120,7 @@ def main():
|
|||||||
# run pipeline forward backward
|
# run pipeline forward backward
|
||||||
batch = iter([batch])
|
batch = iter([batch])
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
batch, model, criterion, optimizer, return_loss=True, return_outputs=True
|
batch, model, criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
|
@ -148,7 +148,7 @@ def train_epoch(
|
|||||||
for _ in pbar:
|
for _ in pbar:
|
||||||
if use_pipeline:
|
if use_pipeline:
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
|
train_dataloader_iter, model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_device:
|
if is_pp_last_device:
|
||||||
|
@ -145,7 +145,7 @@ def train_epoch(
|
|||||||
for _ in pbar:
|
for _ in pbar:
|
||||||
if use_pipeline:
|
if use_pipeline:
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
|
train_dataloader_iter, model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
|
@ -271,7 +271,7 @@ def main():
|
|||||||
for step in pbar:
|
for step in pbar:
|
||||||
if use_pipeline:
|
if use_pipeline:
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
|
dataloader_iter, model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
loss = outputs["loss"]
|
loss = outputs["loss"]
|
||||||
else:
|
else:
|
||||||
|
@ -185,7 +185,7 @@ def main():
|
|||||||
microbatch_size=1,
|
microbatch_size=1,
|
||||||
enable_jit_fused=False,
|
enable_jit_fused=False,
|
||||||
zero_stage=0,
|
zero_stage=0,
|
||||||
precision="fp32",
|
precision=args.mixed_precision,
|
||||||
initial_scale=1,
|
initial_scale=1,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@ -286,7 +286,7 @@ def main():
|
|||||||
for step in pbar:
|
for step in pbar:
|
||||||
if use_pipeline:
|
if use_pipeline:
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
|
dataloader_iter, model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
loss = outputs["loss"]
|
loss = outputs["loss"]
|
||||||
else:
|
else:
|
||||||
|
@ -270,7 +270,6 @@ def main():
|
|||||||
lambda x, y: x.loss,
|
lambda x, y: x.loss,
|
||||||
optimizer,
|
optimizer,
|
||||||
return_loss=True,
|
return_loss=True,
|
||||||
return_outputs=True,
|
|
||||||
)
|
)
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
|
@ -340,7 +340,6 @@ def main():
|
|||||||
lambda x, y: x.loss,
|
lambda x, y: x.loss,
|
||||||
optimizer,
|
optimizer,
|
||||||
return_loss=True,
|
return_loss=True,
|
||||||
return_outputs=True,
|
|
||||||
)
|
)
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
|
@ -42,7 +42,7 @@ def train_epoch(epoch, model, optimizer, _criterion, lr_scheduler, dataloader, b
|
|||||||
for _ in pbar:
|
for _ in pbar:
|
||||||
if use_pipeline:
|
if use_pipeline:
|
||||||
outputs = booster.execute_pipeline(
|
outputs = booster.execute_pipeline(
|
||||||
dataloader, model, _criterion, optimizer, return_loss=True, return_outputs=True
|
dataloader, model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
|
@ -74,7 +74,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[
|
|||||||
loss = criterion(outputs[output_key])
|
loss = criterion(outputs[output_key])
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
booster.execute_pipeline(data_iter, model, _criterion, optimizer, return_loss=True, return_outputs=False)
|
booster.execute_pipeline(data_iter, model, _criterion, optimizer, return_loss=True)
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -75,7 +75,7 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
|
|||||||
model.train()
|
model.train()
|
||||||
if booster.plugin.stage_manager is not None:
|
if booster.plugin.stage_manager is not None:
|
||||||
booster.execute_pipeline(
|
booster.execute_pipeline(
|
||||||
_preprocess_data(data), model, _criterion, optimizer, return_loss=True, return_outputs=False
|
_preprocess_data(data), model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
output = model(**_preprocess_data(data))
|
output = model(**_preprocess_data(data))
|
||||||
@ -109,7 +109,7 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
|
|||||||
data_for_origin = data_gen_fn()
|
data_for_origin = data_gen_fn()
|
||||||
if booster.plugin.stage_manager is not None:
|
if booster.plugin.stage_manager is not None:
|
||||||
booster.execute_pipeline(
|
booster.execute_pipeline(
|
||||||
_preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True, return_outputs=False
|
_preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
booster.execute_pipeline(
|
booster.execute_pipeline(
|
||||||
_preprocess_data(data_for_origin),
|
_preprocess_data(data_for_origin),
|
||||||
@ -117,7 +117,6 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
|
|||||||
_criterion,
|
_criterion,
|
||||||
new_optimizer,
|
new_optimizer,
|
||||||
return_loss=True,
|
return_loss=True,
|
||||||
return_outputs=False,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
old_model_loss = criterion(model(**_preprocess_data(data_for_shard)))
|
old_model_loss = criterion(model(**_preprocess_data(data_for_shard)))
|
||||||
|
@ -49,7 +49,6 @@ def run_fwd_bwd(
|
|||||||
lambda x, y: x.loss,
|
lambda x, y: x.loss,
|
||||||
optimizer,
|
optimizer,
|
||||||
return_loss=True,
|
return_loss=True,
|
||||||
return_outputs=True,
|
|
||||||
)
|
)
|
||||||
# Backward and optimize
|
# Backward and optimize
|
||||||
if is_pp_last_stage:
|
if is_pp_last_stage:
|
||||||
|
@ -104,7 +104,7 @@ def run_pp(
|
|||||||
torch_loss.backward()
|
torch_loss.backward()
|
||||||
|
|
||||||
pp_ret = schedule.forward_backward_step(
|
pp_ret = schedule.forward_backward_step(
|
||||||
sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True, return_outputs=True
|
sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# check loss
|
# check loss
|
||||||
@ -134,7 +134,7 @@ def run_pp(
|
|||||||
torch_loss = criterion(torch_output)
|
torch_loss = criterion(torch_output)
|
||||||
|
|
||||||
pp_ret = schedule.forward_backward_step(
|
pp_ret = schedule.forward_backward_step(
|
||||||
sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True, return_outputs=True
|
sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
if stage_manager.is_last_stage(ignore_chunk=True):
|
if stage_manager.is_last_stage(ignore_chunk=True):
|
||||||
assert torch.allclose(torch_loss, pp_ret["loss"])
|
assert torch.allclose(torch_loss, pp_ret["loss"])
|
||||||
|
@ -100,7 +100,7 @@ def examine_pp(num_microbatch: int, batch_size: int):
|
|||||||
torch_loss = criterion(torch_output)
|
torch_loss = criterion(torch_output)
|
||||||
torch_loss.backward()
|
torch_loss.backward()
|
||||||
pp_ret = schedule.forward_backward_step(
|
pp_ret = schedule.forward_backward_step(
|
||||||
sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True, return_outputs=True
|
sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# check loss
|
# check loss
|
||||||
@ -130,7 +130,7 @@ def examine_pp(num_microbatch: int, batch_size: int):
|
|||||||
torch_loss = criterion(torch_output)
|
torch_loss = criterion(torch_output)
|
||||||
|
|
||||||
pp_ret = schedule.forward_backward_step(
|
pp_ret = schedule.forward_backward_step(
|
||||||
sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True, return_outputs=True
|
sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
|
||||||
)
|
)
|
||||||
if stage_manager.is_last_stage():
|
if stage_manager.is_last_stage():
|
||||||
assert torch.allclose(torch_loss, pp_ret["loss"])
|
assert torch.allclose(torch_loss, pp_ret["loss"])
|
||||||
|
Loading…
Reference in New Issue
Block a user