update trainers

Former-commit-id: d0dd6eefed0b86895ed00a7cafb331e5193db645
2024-03-28 18:16:27 +08:00
parent dc540dfaa8
commit 59e6ebf039
13 changed files with 89 additions and 145 deletions
--- a/examples/extras/galore/adamw.sh
+++ b/examples/extras/galore/adamw.sh
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --fp16
--- a/examples/extras/galore/adamw_8bit_bf16.sh
+++ b/examples/extras/galore/adamw_8bit_bf16.sh
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --optim adamw_8bit \
-    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
--- a/examples/extras/galore/galore_adamw_8bit_bf16.sh
+++ b/examples/extras/galore/galore_adamw_8bit_bf16.sh
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path meta-llama/Llama-2-7b-hf \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type full \
-    --optim adamw_8bit \
-    --use_galore \
-    --galore_layerwise \
-    --galore_target mlp,self_attn \
-    --galore_rank 128 \
-    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
--- a/examples/extras/galore/galore_adamw.sh
+++ b/examples/extras/galore/galore_adamw.sh
@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
    --max_samples 3000 \
    --val_size 0.1 \
    --plot_loss \
-    --fp16
+    --pure_bf16