training code done

This commit is contained in:
wl-zhao
2024-03-10 13:05:02 +00:00
parent c9c57a17f4
commit 7ade7b740e
16 changed files with 1533 additions and 47 deletions

19
melo/train.sh Normal file
View File

@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Launch distributed training with torchrun and automatically relaunch it
# after a failure.
#
# Usage: train.sh CONFIG GPUS
#   CONFIG  path to the config file, passed to train.py as --c; the name of
#           its parent directory is passed to train.py as --model
#   GPUS    value passed to torchrun as --nproc_per_node
#
# The script exits once torchrun returns 0; any non-zero exit triggers a
# cleanup of leftover worker processes, a pause, and a relaunch.

if [[ $# -lt 2 ]]; then
  printf 'Usage: %s CONFIG GPUS\n' "${0##*/}" >&2
  exit 2
fi

CONFIG=$1
GPUS=$2
MODEL_NAME=$(basename -- "$(dirname -- "$CONFIG")")
PORT=10902

# Print the PIDs of processes whose command line contains both the config
# path and "python". The config path is matched as a literal substring (not a
# regex) and is handed to awk through the environment so awk's own command
# line never contains it; this script's own PID is excluded as well.
leftover_pids() {
  ps -eo pid=,args= |
    CONFIG_PATH="$CONFIG" awk -v self="$$" '
      $1 != self && index($0, ENVIRON["CONFIG_PATH"]) && index($0, "python") {
        print $1
      }'
}

# Auto-resume: training sometimes crashes because of a gloo bug on some GPUs.
while :; do
  torchrun --nproc_per_node="$GPUS" \
    --master_port="$PORT" \
    train.py --c "$CONFIG" --model "$MODEL_NAME"
  status=$?

  # Collect any worker processes that outlived torchrun.
  pids=()
  while IFS= read -r pid; do
    pids+=("$pid")
  done < <(leftover_pids)

  if ((${#pids[@]} > 0)); then
    printf '%s\n' "${pids[@]}"
    # Ask politely first; a process may already be gone, so errors from kill
    # are intentionally ignored.
    kill -TERM "${pids[@]}" 2>/dev/null
    sleep 5
    for pid in "${pids[@]}"; do
      # Force-kill only what ignored SIGTERM.
      if kill -0 "$pid" 2>/dev/null; then
        kill -KILL "$pid" 2>/dev/null
      fi
    done
  fi

  # A clean exit means training finished; do not restart it.
  if ((status == 0)); then
    break
  fi

  sleep 30
done