Fix MoD (Mixture-of-Depths) stuff

Former-commit-id: cf3988226e6398c67bb2955578e436fc505aa5c5
2024-04-21 18:11:10 +08:00
parent 3365cc8cf0
commit f8e219dc81
16 changed files with 63 additions and 88 deletions
--- a/examples/extras/MoD/freeze_sft.sh
+++ b/examples/extras/MoD/freeze_sft.sh
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
-    --stage sft \
-    --do_train \
-    --model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
-    --dataset alpaca_gpt4_en,glaive_toolcall \
-    --dataset_dir ../../../data \
-    --template default \
-    --finetuning_type freeze \
-    --name_module_trainable router \
-    --output_dir ../../../saves/TinyLlama/TinyLlama-1.1B-Chat-v1.0/sft \
-    --mixture_of_depths convert \
-    --overwrite_cache \
-    --overwrite_output_dir \
-    --cutoff_len 1024 \
-    --preprocessing_num_workers 16 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
-    --lr_scheduler_type cosine \
-    --logging_steps 10 \
-    --warmup_steps 20 \
-    --save_steps 100 \
-    --eval_steps 100 \
-    --evaluation_strategy steps \
-    --load_best_model_at_end \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
-    --val_size 0.1 \
-    --plot_loss \
-    --pure_bf16
--- a/examples/extras/MoD/sft.sh
+++ b/examples/extras/MoD/sft.sh
@@ -3,20 +3,21 @@
 CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
    --stage sft \
    --do_train \
-    --model_name_or_path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../../data \
    --template default \
    --finetuning_type full \
-    --output_dir ../../../saves/TinyLlama/TinyLlama-1.1B-Chat-v1.0/sft \
    --mixture_of_depths convert \
+    --output_dir ../../../saves/LLaMA2-7B/mod/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 1 \
+    --gradient_accumulation_steps 8 \
+    --optim paged_adamw_8bit \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --warmup_steps 20 \
--- a/examples/extras/galore/sft.sh
+++ b/examples/extras/galore/sft.sh
@@ -11,6 +11,7 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
    --use_galore \
    --galore_layerwise \
    --galore_target mlp,self_attn \
+    --galore_scale 2.0 \
    --galore_rank 128 \
    --output_dir ../../../saves/LLaMA2-7B/galore/sft \
    --overwrite_cache \
@@ -28,8 +29,8 @@ CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --max_samples 3000 \
+    --num_train_epochs 30.0 \
+    --max_samples 300 \
    --val_size 0.1 \
    --plot_loss \
    --pure_bf16