diff --git a/docs/source/en/features/gradient_accumulation_with_booster.md b/docs/source/en/features/gradient_accumulation_with_booster.md index 201e3bc2b..7bc4eb47b 100644 --- a/docs/source/en/features/gradient_accumulation_with_booster.md +++ b/docs/source/en/features/gradient_accumulation_with_booster.md @@ -103,10 +103,12 @@ for idx, (img, label) in enumerate(train_dataloader): with sync_context: output = model(img) train_loss = criterion(output, label) + train_loss = train_loss / GRADIENT_ACCUMULATION booster.backward(train_loss, optimizer) else: output = model(img) train_loss = criterion(output, label) + train_loss = train_loss / GRADIENT_ACCUMULATION booster.backward(train_loss, optimizer) optimizer.step() optimizer.zero_grad() diff --git a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md index a8422060f..d121b161b 100644 --- a/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md +++ b/docs/source/zh-Hans/features/gradient_accumulation_with_booster.md @@ -106,10 +106,12 @@ for idx, (img, label) in enumerate(train_dataloader): with sync_context: output = model(img) train_loss = criterion(output, label) + train_loss = train_loss / GRADIENT_ACCUMULATION booster.backward(train_loss, optimizer) else: output = model(img) train_loss = criterion(output, label) + train_loss = train_loss / GRADIENT_ACCUMULATION booster.backward(train_loss, optimizer) optimizer.step() optimizer.zero_grad()