[optim] clean apollo (#6645)

* clean apollo code
* update readme

Former-commit-id: 38b8ec4a99189483124b54df9d6bc6b0d318855a
2025-01-15 01:42:50 +08:00
parent c2120432db
commit 7638f1070e
14 changed files with 110 additions and 103 deletions
--- a/examples/extras/apollo/llama3_full_sft.yaml
+++ b/examples/extras/apollo/llama3_full_sft.yaml
@@ -7,8 +7,8 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_apollo: true
-apollo_layerwise: true
-apollo_target: mlp,self_attn
+apollo_layerwise: true  # choices: [true, false], use false for DDP training
+apollo_target: all
 apollo_rank: 128
 apollo_scale: 32.0
 apollo_scale_type: channel
@@ -22,7 +22,7 @@ overwrite_cache: true
 preprocessing_num_workers: 16

 ### output
-output_dir: saves/llama3-8b/apollo_full-scale32/sft
+output_dir: saves/llama3-8b/full/sft
 logging_steps: 10
 save_steps: 500
 plot_loss: true
@@ -30,7 +30,7 @@ overwrite_output_dir: true

 ### train
 per_device_train_batch_size: 1
-gradient_accumulation_steps: 1
+gradient_accumulation_steps: 1  # use 1 for layerwise apollo
 learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine