[fix] fix bwd b; bwd w now applies only to layers replaced by Linear1D_Col/Row; all other layers perform a full bwd;

This commit is contained in:
duanjunwen
2024-10-15 06:26:01 +00:00
parent 160e9a4175
commit 9912cc8c07
4 changed files with 11 additions and 11 deletions

View File

@@ -509,12 +509,11 @@ class ZeroBubbleVPipeScheduler(PipelineSchedule):
optimizer.backward_by_grad(
tensor=output_obj_,
grad=output_obj_grad_,
inputs=input_obj_,
retain_graph=True,
# inputs=input_obj_,
# retain_graph=True,
)
# Format output_obj_grad
input_obj_grad = {}
input_obj_grad = dict()
if model_chunk_id == 0 and self.stage_manager.is_first_stage(ignore_chunk=True):
pass
else:
@@ -714,7 +713,6 @@ class ZeroBubbleVPipeScheduler(PipelineSchedule):
# # we save output_tensor_grad here
# self.output_tensors_grad_dw[model_chunk_id].append(output_tensor_grad)
# Step2: bwd step
input_object_grad = self.backward_b_step(
model_chunk=model_chunk,
model_chunk_id=model_chunk_id,
@@ -761,7 +759,6 @@ class ZeroBubbleVPipeScheduler(PipelineSchedule):
# get y & dy from buffer
# output_obj = self.output_tensors_dw[model_chunk_id].pop(0)
# output_obj_grad = self.output_tensors_grad_dw[model_chunk_id].pop(0)
WeightGradStore.pop(chunk=model_chunk_id)
# self.backward_w_step(