Tiny fix for BAdam

Former-commit-id: 03f49267c7406e36aee35639f86e6e0383897090
2024-06-25 01:54:53 +08:00
parent 98fb3d015a
commit 9fd7a410bb
15 changed files with 31 additions and 102 deletions
--- a/examples/extras/badam/llama3_full_sft.yaml
+++ b/examples/extras/badam/llama3_full_sft.yaml
@@ -6,6 +6,7 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
@@ -32,7 +33,6 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-pure_bf16: true

 ### eval
 val_size: 0.1
--- a/examples/extras/badam/llama3_full_sft_ds3.yaml
+++ b/examples/extras/badam/llama3_full_sft_ds3.yaml
@@ -6,9 +6,11 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
+deepspeed: examples/deepspeed/ds_z3_config.json

 ### dataset
 dataset: identity,alpaca_en_demo
@@ -28,7 +30,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-6
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
--- a/examples/extras/badam/train_single_gpu.sh
+++ b/examples/extras/badam/train_single_gpu.sh
@@ -1,37 +0,0 @@
-#!/bin/bash
-export CUDA_VISIBLE_DEVICES=0
-
-cd ../../..
-
-llamafactory-cli train \
-    --stage sft \
-    --do_train True \
-    --model_name_or_path meta-llama/Llama-2-13b-hf \
-    --preprocessing_num_workers 16 \
-    --finetuning_type full \
-    --template default \
-    --flash_attn auto \
-    --dataset_dir data \
-    --dataset alpaca_en_demo \
-    --cutoff_len 1024 \
-    --learning_rate 1e-6 \
-    --num_train_epochs 3.0 \
-    --max_samples 100000 \
-    --per_device_train_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --max_grad_norm 1.0 \
-    --logging_steps 5 \
-    --save_steps 100 \
-    --warmup_steps 0 \
-    --optim adamw_torch \
-    --packing False \
-    --report_to none \
-    --use_badam True \
-    --output_dir saves/LLaMA2-13B/full/BAdam \
-    --plot_loss True \
-    --ddp_timeout 180000000 \
-    --include_num_input_tokens_seen True \
-    --badam_mode layer \
-    --badam_switch_mode ascending \
-    --badam_switch_interval 50
--- a/examples/extras/badam/train_zero3.sh
+++ b/examples/extras/badam/train_zero3.sh
@@ -1,39 +0,0 @@
-#!/bin/bash
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-
-cd ../../..
-
-llamafactory-cli train \
-    --stage sft \
-    --do_train True \
-    --model_name_or_path meta-llama/Llama-2-13b-hf \
-    --preprocessing_num_workers 16 \
-    --finetuning_type full \
-    --template default \
-    --flash_attn auto \
-    --dataset_dir data \
-    --dataset alpaca_en_demo \
-    --cutoff_len 1024 \
-    --learning_rate 1e-6 \
-    --num_train_epochs 3.0 \
-    --max_samples 100000 \
-    --per_device_train_batch_size 8 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --max_grad_norm 1.0 \
-    --logging_steps 5 \
-    --save_steps 100 \
-    --warmup_steps 0 \
-    --optim adamw_torch \
-    --packing False \
-    --report_to none \
-    --use_badam True \
-    --output_dir saves/LLaMA2-13B/full/BAdam \
-    --fp16 True \
-    --plot_loss True \
-    --ddp_timeout 180000000 \
-    --include_num_input_tokens_seen True \
-    --badam_mode layer \
-    --badam_switch_mode ascending \
-    --badam_switch_interval 50 \
-    --deepspeed cache/ds_z3_config.json