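fix #3602: define launch settings in the full_multi_gpu example scripts and run single_node.sh via torch.distributed.run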

Former-commit-id: 1518b45490606ea200482da4737113c46985e8c5
2024-05-07 17:50:27 +08:00
parent 3d74f21738
commit ebab655683
7 changed files with 43 additions and 5 deletions
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -1,6 +1,12 @@
 #!/bin/bash

-python -m torch.distributed.run \
+NPROC_PER_NODE=4
+NNODES=2
+RANK=0
+MASTER_ADDR=192.168.0.1
+MASTER_PORT=29500
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
    --nproc_per_node $NPROC_PER_NODE \
    --nnodes $NNODES \
    --node_rank $RANK \
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,4 +1,9 @@
 #!/bin/bash

-deepspeed --include "localhost:0,1,2,3" \
+NPROC_PER_NODE=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes 1 \
+    --standalone \
    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml