From 82d744716a40b5d23b8e90eeb7eca51fa7af2645 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Mon, 3 Jun 2024 19:12:29 +0800
Subject: [PATCH] fix #4005 #4013

Former-commit-id: 8608fa268cde5cddf8d0c6c2eb2cb5fa246c1831
---
 examples/README.md | 8 ++++----
 examples/README_zh.md | 10 +++++-----
 examples/extras/badam/llama3_lora_sft.yaml | 4 ++--
 examples/extras/fsdp_qlora/llama3_lora_sft.yaml | 4 ++--
 examples/extras/galore/llama3_full_sft.yaml | 4 ++--
 examples/extras/llama_pro/llama3_freeze_sft.yaml | 4 ++--
 examples/extras/loraplus/llama3_lora_sft.yaml | 4 ++--
 examples/extras/mod/llama3_full_sft.yaml | 4 ++--
 examples/full_multi_gpu/llama3_full_sft.yaml | 4 ++--
 examples/lora_multi_gpu/llama3_lora_sft.yaml | 4 ++--
 examples/lora_multi_gpu/llama3_lora_sft_ds.yaml | 4 ++--
 examples/lora_multi_npu/llama3_lora_sft_ds.yaml | 4 ++--
 examples/lora_single_gpu/llama3_lora_dpo.yaml | 4 ++--
 examples/lora_single_gpu/llama3_lora_kto.yaml | 4 ++--
 examples/lora_single_gpu/llama3_lora_ppo.yaml | 4 ++--
 examples/lora_single_gpu/llama3_lora_pretrain.yaml | 4 ++--
 examples/lora_single_gpu/llama3_lora_reward.yaml | 4 ++--
 examples/lora_single_gpu/llama3_lora_sft.yaml | 4 ++--
 examples/lora_single_gpu/llava1_5_lora_sft.yaml | 4 ++--
 examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml | 4 ++--
 examples/qlora_single_gpu/llama3_lora_sft_awq.yaml | 4 ++--
 .../qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml | 4 ++--
 examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml | 4 ++--
 src/llamafactory/hparams/model_args.py | 2 +-
 24 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 727b27c80..f985d5529 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -107,13 +107,13 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l

 ### LoRA Fine-Tuning on Multiple GPUs

-#### Supervised Fine-Tuning with Accelerate on Single Node
+#### Supervised Fine-Tuning on Single Node

 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```

-#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
+#### Supervised Fine-Tuning on Multiple Nodes

 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -136,13 +136,13 @@ ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu

 ### Full-Parameter Fine-Tuning on Multiple GPUs

-#### Supervised Fine-Tuning with Accelerate on Single Node
+#### Supervised Fine-Tuning on Single Node

 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```

-#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
+#### Supervised Fine-Tuning on Multiple Nodes

 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 6974faa90..cf5bbf49f 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -107,13 +107,13 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l

 ### 多 GPU LoRA 微调

-#### 使用 Accelerate 进行单节点训练
+#### 在单机上进行指令监督微调

 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```

-#### 使用 Accelerate 进行多节点训练
+#### 在多机上进行指令监督微调

 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -128,7 +128,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llam

 ### 多 NPU LoRA 微调

-#### 使用 DeepSpeed ZeRO-0 训练
+#### 使用 DeepSpeed ZeRO-0 进行指令监督微调

 ```bash
 ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml
@@ -136,13 +136,13 @@ ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu

 ### 多 GPU 全参数微调

-#### 使用 DeepSpeed 进行单节点训练
+#### 在单机上进行指令监督微调

 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```

-#### 使用 DeepSpeed 进行多节点训练
+#### 在多机上进行指令监督微调

 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/extras/badam/llama3_lora_sft.yaml b/examples/extras/badam/llama3_lora_sft.yaml
index 4a482749f..242e63ab1 100644
--- a/examples/extras/badam/llama3_lora_sft.yaml
+++ b/examples/extras/badam/llama3_lora_sft.yaml
@@ -28,10 +28,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 pure_bf16: true

 ### eval
diff --git a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
index e9c04fa92..920d8fdb1 100644
--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/extras/galore/llama3_full_sft.yaml b/examples/extras/galore/llama3_full_sft.yaml
index 87381fcc1..3db31fed0 100644
--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 pure_bf16: true

 ### eval
diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
index 8ace8db8c..214f411ae 100644
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -27,10 +27,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/extras/loraplus/llama3_lora_sft.yaml b/examples/extras/loraplus/llama3_lora_sft.yaml
index 26c2b1d2d..9936bcd30 100644
--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml
index 6b724ed0e..edfec44e0 100644
--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 pure_bf16: true

 ### eval
diff --git a/examples/full_multi_gpu/llama3_full_sft.yaml b/examples/full_multi_gpu/llama3_full_sft.yaml
index a96f1b8e5..b8873e3ac 100644
--- a/examples/full_multi_gpu/llama3_full_sft.yaml
+++ b/examples/full_multi_gpu/llama3_full_sft.yaml
@@ -28,10 +28,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/lora_multi_gpu/llama3_lora_sft.yaml b/examples/lora_multi_gpu/llama3_lora_sft.yaml
index 6389f21b3..5e5dd9e6e 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft.yaml
@@ -28,10 +28,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
index 6011896af..e8dee2163 100644
--- a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
index 65ab6347d..825b84503 100644
--- a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/lora_single_gpu/llama3_lora_dpo.yaml
index f68244b7e..62752e573 100644
--- a/examples/lora_single_gpu/llama3_lora_dpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml
@@ -27,10 +27,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.000005
+learning_rate: 5.0e-6
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/lora_single_gpu/llama3_lora_kto.yaml b/examples/lora_single_gpu/llama3_lora_kto.yaml
index 4405aaecd..6f6898185 100644
--- a/examples/lora_single_gpu/llama3_lora_kto.yaml
+++ b/examples/lora_single_gpu/llama3_lora_kto.yaml
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.000005
+learning_rate: 5.0e-6
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/lora_single_gpu/llama3_lora_ppo.yaml b/examples/lora_single_gpu/llama3_lora_ppo.yaml
index 88ce24f3e..19e7ccb37 100644
--- a/examples/lora_single_gpu/llama3_lora_ppo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_ppo.yaml
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.00001
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### generate
diff --git a/examples/lora_single_gpu/llama3_lora_pretrain.yaml b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
index acb18ebf4..54c5d89ae 100644
--- a/examples/lora_single_gpu/llama3_lora_pretrain.yaml
+++ b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
@@ -24,10 +24,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/lora_single_gpu/llama3_lora_reward.yaml b/examples/lora_single_gpu/llama3_lora_reward.yaml
index 6bf2ca024..c82f9414c 100644
--- a/examples/lora_single_gpu/llama3_lora_reward.yaml
+++ b/examples/lora_single_gpu/llama3_lora_reward.yaml
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.00001
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/lora_single_gpu/llama3_lora_sft.yaml b/examples/lora_single_gpu/llama3_lora_sft.yaml
index 5492bc342..429cb6afd 100644
--- a/examples/lora_single_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_single_gpu/llama3_lora_sft.yaml
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/lora_single_gpu/llava1_5_lora_sft.yaml b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
index 8e4226da1..acab48848 100644
--- a/examples/lora_single_gpu/llava1_5_lora_sft.yaml
+++ b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
index d2658051e..53cc12e28 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
index ba6d8ea59..1a92f8228 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
index a3db35ff1..c7f72c66d 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
index cc9a454e8..45caf17cc 100644
--- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true

 ### eval
diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index 995d5f12b..7003cbeea 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -107,7 +107,7 @@ class ModelArguments:
     )
     vllm_maxlen: int = field(
         default=2048,
-        metadata={"help": "Maximum sequence length of the vLLM engine (including prompt and output)."},
+        metadata={"help": "Maximum sequence (prompt + response) length of the vLLM engine."},
     )
     vllm_gpu_util: float = field(
         default=0.9,
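
Note below the patch (reviewer sketch, not part of the commit): in Hugging Face `TrainingArguments`, `warmup_steps` expects an integer step count, while a fractional value such as 0.1 is meant as a fraction of the total training steps, which is what `warmup_ratio` takes; the learning rates are also rewritten in scientific notation. As a minimal illustration assembled only from the `+` and context lines of the hunk above, the training block of examples/lora_single_gpu/llama3_lora_sft.yaml reads as follows after this patch (files such as the BAdam and GaLore examples use pure_bf16 instead of fp16):

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
fp16: true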