training code done
This commit is contained in:
19
melo/train.sh
Normal file
19
melo/train.sh
Normal file
@@ -0,0 +1,19 @@
|
||||
# Usage: bash train.sh <path/to/config.json> <num_gpus>
#
# Positional arguments:
#   $1  path to the training config; forwarded to train.py as --c
#   $2  value forwarded to torchrun as --nproc_per_node

# Print the name of the directory that contains the file path given as $1.
# Every expansion is quoted so a path containing spaces or glob characters
# is handled as a single argument.
model_name_from_config() {
  basename -- "$(dirname -- "$1")"
}

CONFIG=${1:-}
GPUS=${2:-}
# The model name is the name of the config file's parent directory.
MODEL_NAME=$(model_name_from_config "$CONFIG")

# Port forwarded to torchrun as --master_port.
PORT=10902
|
||||
|
||||
# Both positional arguments are required: without them the torchrun command
# line below is malformed and the loop would retry it forever.
if [ -z "${CONFIG:-}" ] || [ -z "${GPUS:-}" ]; then
  printf 'usage: %s <path/to/config.json> <num_gpus>\n' "$0" >&2
  exit 2
fi

# Fail once up front instead of retrying a missing launcher every 30 seconds.
if ! command -v torchrun >/dev/null 2>&1; then
  printf '%s: torchrun not found in PATH\n' "$0" >&2
  exit 127
fi

# Print the PIDs of processes whose command line contains both the config
# path and the word "python".
#   - Matching is by fixed substring, so regex metacharacters or a leading
#     '-' in the path cannot change the match.
#   - The config path reaches awk through the environment, so awk's own
#     command line never contains it.
#   - This script ($$) and its direct children (pipeline subshells) are
#     skipped so the script cannot kill itself when the config path happens
#     to contain "python".
stale_train_pids() {
  ps -A -o pid= -o ppid= -o args= |
    TRAIN_CONFIG="$CONFIG" awk -v self="$$" '
      $1 != self && $2 != self && index($0, ENVIRON["TRAIN_CONFIG"]) && index($0, "python") { print $1 }'
}

# Auto-resume: training sometimes crashes because of a gloo bug on some GPUs,
# so the launcher is restarted whenever it returns.
# NOTE: the loop has no exit condition; it also restarts after a clean exit
# and must be stopped manually.
while :; do
  torchrun --nproc_per_node="$GPUS" \
    --master_port="$PORT" \
    train.py --c "$CONFIG" --model "$MODEL_NAME"

  # Force-kill leftover python processes for this config before relaunching.
  # NOTE(review): SIGKILL is kept from the original; presumably the leftover
  # workers are hung and would block the next run - confirm before softening.
  stale_train_pids | while read -r PID; do
    echo "$PID"
    kill -9 "$PID"
  done

  sleep 30
done
|
||||
Reference in New Issue
Block a user