support DDP in webui
Former-commit-id: d059262ff8dc857f597d2657546ec625726a664a
@@ -110,19 +110,20 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l
#### Supervised Fine-Tuning with Accelerate on Single Node

```bash
bash examples/lora_multi_gpu/single_node.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
```

#### Supervised Fine-Tuning with Accelerate on Multiple Nodes

```bash
bash examples/lora_multi_gpu/multi_node.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
```
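
The `NNODES`, `RANK`, `MASTER_ADDR`, and `MASTER_PORT` variables set up the DDP rendezvous; each node runs the same command with its own `RANK`. Assuming the CLI forwards them to a torchrun-style launcher (as the deleted `multi_node.sh` later in this diff does for the full-parameter example), a minimal sketch of the equivalent raw launch for rank 0 is:

```bash
# Hypothetical torchrun equivalent of the rank-0 command above; the exact
# mapping performed by llamafactory-cli may differ.
CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
    --nproc_per_node 4 \
    --nnodes 2 \
    --node_rank 0 \
    --master_addr 192.168.0.1 \
    --master_port 29500 \
    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
```

Run the same command on the second node with `--node_rank 1`.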

#### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)

```bash
bash examples/lora_multi_gpu/ds_zero3.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
```

### LoRA Fine-Tuning on Multiple NPUs

@@ -130,7 +131,7 @@ bash examples/lora_multi_gpu/ds_zero3.sh

#### Supervised Fine-Tuning with DeepSpeed ZeRO-0

```bash
bash examples/lora_multi_npu/ds_zero0.sh
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml
```

### Full-Parameter Fine-Tuning on Multiple GPUs

@@ -138,19 +139,20 @@ bash examples/lora_multi_npu/ds_zero0.sh

#### Supervised Fine-Tuning with Accelerate on Single Node

```bash
bash examples/full_multi_gpu/single_node.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
```

#### Supervised Fine-Tuning with Accelerate on Multiple Nodes

```bash
bash examples/full_multi_gpu/multi_node.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
```

#### Batch Predicting and Computing BLEU and ROUGE Scores

```bash
bash examples/full_multi_gpu/predict.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_predict.yaml
```

### Merging LoRA Adapters and Quantization

@@ -110,19 +110,20 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l

#### Single-Node Training with Accelerate

```bash
bash examples/lora_multi_gpu/single_node.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
```

#### Multi-Node Training with Accelerate

```bash
bash examples/lora_multi_gpu/multi_node.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
```

#### Evenly Distributing GPU Memory with DeepSpeed ZeRO-3

```bash
bash examples/lora_multi_gpu/ds_zero3.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
```

### LoRA Fine-Tuning on Multiple NPUs

@@ -130,7 +131,7 @@ bash examples/lora_multi_gpu/ds_zero3.sh

#### Training with DeepSpeed ZeRO-0

```bash
bash examples/lora_multi_npu/ds_zero0.sh
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml
```

### Full-Parameter Fine-Tuning on Multiple GPUs

@@ -138,19 +139,20 @@ bash examples/lora_multi_npu/ds_zero0.sh

#### Single-Node Training with DeepSpeed

```bash
bash examples/full_multi_gpu/single_node.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
```

#### Multi-Node Training with DeepSpeed

```bash
bash examples/full_multi_gpu/multi_node.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
```

#### Batch Prediction and Computing BLEU and ROUGE Scores

```bash
bash examples/full_multi_gpu/predict.sh
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_predict.yaml
```

### Merging LoRA Adapters and Model Quantization

@@ -1,18 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_process_ip: 192.168.0.1
main_process_port: 29555
main_training_function: main
mixed_precision: fp16
num_machines: 2 # the number of nodes
num_processes: 8 # the number of GPUs in all nodes
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

@@ -1,16 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1 # the number of nodes
num_processes: 4 # the number of GPUs in all nodes
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

@@ -1,18 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 1
main_process_ip: 192.168.0.1
main_process_port: 29555
main_training_function: main
mixed_precision: fp16
num_machines: 2 # the number of nodes
num_processes: 8 # the number of GPUs in all nodes
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
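
The two multi-node configs above (apparently the master and worker files referenced by the deleted launch scripts below) differ only in `machine_rank`: 0 on the main node and 1 on the second node, with `num_processes: 8` covering 2 nodes × 4 GPUs. As a sketch, assuming the standard `accelerate launch` overrides, a single config file could serve both nodes by overriding the rank on the command line:

```bash
# Sketch: reuse one multi-node Accelerate config and override the per-node rank
# at launch time instead of keeping separate master/slave files (assumed setup).
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file examples/accelerate/master_config.yaml \
    --machine_rank 1 \
    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
```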
@@ -1,15 +0,0 @@
#!/bin/bash

NPROC_PER_NODE=4
NNODES=2
RANK=0
MASTER_ADDR=192.168.0.1
MASTER_PORT=29500

CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
    --nproc_per_node $NPROC_PER_NODE \
    --nnodes $NNODES \
    --node_rank $RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml

@@ -1,5 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file examples/accelerate/single_config.yaml \
    src/train.py examples/full_multi_gpu/llama3_full_predict.yaml

@@ -1,15 +0,0 @@
#!/bin/bash

NPROC_PER_NODE=4
NNODES=1
RANK=0
MASTER_ADDR=127.0.0.1
MASTER_PORT=29500

CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
    --nproc_per_node $NPROC_PER_NODE \
    --nnodes $NNODES \
    --node_rank $RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml

@@ -1,15 +0,0 @@
#!/bin/bash

NPROC_PER_NODE=4
NNODES=1
RANK=0
MASTER_ADDR=127.0.0.1
MASTER_PORT=29500

CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
    --nproc_per_node $NPROC_PER_NODE \
    --nnodes $NNODES \
    --node_rank $RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml

@@ -1,6 +0,0 @@
#!/bin/bash
# also launch it on slave machine using slave_config.yaml

CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file examples/accelerate/master_config.yaml \
    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml

@@ -1,5 +0,0 @@
#!/bin/bash

CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file examples/accelerate/single_config.yaml \
    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml

@@ -1,15 +0,0 @@
#!/bin/bash

NPROC_PER_NODE=4
NNODES=1
RANK=0
MASTER_ADDR=127.0.0.1
MASTER_PORT=29500

ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun \
    --nproc_per_node $NPROC_PER_NODE \
    --nnodes $NNODES \
    --node_rank $RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    src/train.py examples/lora_multi_npu/llama3_lora_sft_ds.yaml