mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-02 09:38:05 +00:00
[pipeline]: support arbitrary batch size in forward_only mode (#5201)
* fix: remove drop last in val & test dataloader
* feat: add run_forward_only, support arbitrary bs
* chore: modify ci script
This commit is contained in:
@@ -88,24 +88,21 @@ class GLUEDataBuilder:
|
||||
)
|
||||
|
||||
def val_dataloader(self):
|
||||
# TODO: drop_last is set to True for now to avoid error when using PP
|
||||
# as the last batch may not be divisible by the number of microbatches
|
||||
if len(self.eval_splits) == 1:
|
||||
return self.plugin.prepare_dataloader(
|
||||
self.dataset["validation"], batch_size=self.eval_batch_size, drop_last=True
|
||||
)
|
||||
return self.plugin.prepare_dataloader(self.dataset["validation"], batch_size=self.eval_batch_size)
|
||||
elif len(self.eval_splits) > 1:
|
||||
return [
|
||||
self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size, drop_last=True)
|
||||
self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size)
|
||||
for x in self.eval_splits
|
||||
]
|
||||
|
||||
def test_dataloader(self):
|
||||
if len(self.eval_splits) == 1:
|
||||
return self.plugin.prepare_dataloader(self.dataset["test"], batch_size=self.eval_batch_size, drop_last=True)
|
||||
return self.plugin.prepare_dataloader(self.dataset["test"], batch_size=self.eval_batch_size)
|
||||
elif len(self.eval_splits) > 1:
|
||||
return [
|
||||
self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size, drop_last=True)
|
||||
self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size)
|
||||
for x in self.eval_splits
|
||||
]
|
||||
|
||||
|
@@ -1,8 +1,17 @@
|
||||
#!/bin/bash
|
||||
set -xe
|
||||
set -x
|
||||
|
||||
pip install -r requirements.txt
|
||||
|
||||
FAIL_LIMIT=3
|
||||
|
||||
for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero" "hybrid_parallel"; do
|
||||
torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin --model_type "bert"
|
||||
for i in $(seq 1 $FAIL_LIMIT); do
|
||||
torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin --model_type "bert" && break
|
||||
echo "Failed $i times"
|
||||
if [ $i -eq $FAIL_LIMIT ]; then
|
||||
echo "Failed $FAIL_LIMIT times, exiting"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
Reference in New Issue
Block a user