[feature] adding orthogonal finetuning (OFT) to llama factory (#8623)

Co-authored-by: Zeju <zqiu@g003.internal.cluster.is.localnet> Co-authored-by: Zeju <zqiu@login2.is.localnet> Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
2025-08-18 12:22:47 +02:00
parent 1ada15981a
commit 003a2acb1a
13 changed files with 375 additions and 47 deletions
--- a/examples/README.md
+++ b/examples/README.md
@@ -290,3 +290,15 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
 ```bash
 bash examples/extras/fsdp_qlora/train.sh
 ```
+
+#### OFT Fine-Tuning
+
+```bash
+llamafactory-cli train examples/extras/oft/llama3_oft_sft.yaml
+```
+
+#### QOFT Fine-Tuning
+
+```bash
+llamafactory-cli train examples/extras/qoft/llama3_oft_sft_bnb_npu.yaml
+```
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -290,3 +290,15 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
 ```bash
 bash examples/extras/fsdp_qlora/train.sh
 ```
+
+#### OFT 微调
+
+```bash
+llamafactory-cli train examples/extras/oft/llama3_oft_sft.yaml
+```
+
+#### QOFT 微调
+
+```bash
+llamafactory-cli train examples/extras/qoft/llama3_oft_sft_bnb_npu.yaml
+```
--- a/examples/extras/oft/llama3_oft_sft.yaml
+++ b/examples/extras/oft/llama3_oft_sft.yaml
@@ -0,0 +1,46 @@
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: oft
+oft_block_size: 32
+oft_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/llama3-8b/oft/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+
+### eval
+# eval_dataset: alpaca_en_demo  # set either eval_dataset or val_size, not both
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/oft/qwen2_5vl_oft_sft.yaml
+++ b/examples/extras/oft/qwen2_5vl_oft_sft.yaml
@@ -0,0 +1,47 @@
+### model
+model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
+image_max_pixels: 262144
+video_max_pixels: 16384
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: oft
+oft_block_size: 32
+oft_target: all
+
+### dataset
+dataset: mllm_demo,identity,alpaca_en_demo  # video: mllm_video_demo
+template: qwen2_vl
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/qwen2_5vl-7b/oft/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/qoft/llama3_oft_sft_awq.yaml
+++ b/examples/extras/qoft/llama3_oft_sft_awq.yaml
@@ -0,0 +1,44 @@
+### model
+model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: oft
+oft_block_size: 32
+oft_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/llama3-8b/oft/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/qoft/llama3_oft_sft_bnb_npu.yaml
+++ b/examples/extras/qoft/llama3_oft_sft_bnb_npu.yaml
@@ -0,0 +1,47 @@
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+quantization_bit: 4
+quantization_method: bnb
+double_quantization: false
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: oft
+oft_block_size: 32
+oft_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/llama3-8b/oft/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/qoft/llama3_oft_sft_gptq.yaml
+++ b/examples/extras/qoft/llama3_oft_sft_gptq.yaml
@@ -0,0 +1,44 @@
+### model
+model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: oft
+oft_block_size: 32
+oft_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+
+### output
+output_dir: saves/llama3-8b/oft/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500