mirror of
https://github.com/hiyouga/LlamaFactory.git
synced 2026-02-03 08:53:38 +00:00
[example] Add KTransformers Qwen3MoE example (#9511)
Co-authored-by: unknown <xiongchenhui@hisense.ad> Co-authored-by: Kingsley <kingsleydodonow@gmail.com>
This commit is contained in:
10
examples/inference/qwen3moe_lora_sft_kt.yaml
Normal file
10
examples/inference/qwen3moe_lora_sft_kt.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
model_name_or_path: Qwen/Qwen3-235B-A22B-Instruct-2507
|
||||||
|
adapter_name_or_path: saves/Kllama_Qwen3MoE_235bA22b
|
||||||
|
template: qwen3_nothink
|
||||||
|
infer_backend: ktransformers # choices: [huggingface, vllm, sglang, ktransformers]
|
||||||
|
trust_remote_code: true
|
||||||
|
|
||||||
|
use_kt: true # use KTransformers as the LoRA SFT backend for inference
|
||||||
|
kt_optimize_rule: examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
|
||||||
|
cpu_infer: 32
|
||||||
|
chunk_size: 8192
|
||||||
80
examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
Normal file
80
examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
- match:
|
||||||
|
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
|
||||||
|
replace:
|
||||||
|
class: ktransformers.operators.RoPE.RotaryEmbedding
|
||||||
|
kwargs:
|
||||||
|
generate_device: "cuda"
|
||||||
|
prefill_device: "cuda"
|
||||||
|
|
||||||
|
- match:
|
||||||
|
name: "^lm_head$" # regular expression
|
||||||
|
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||||
|
replace:
|
||||||
|
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||||
|
kwargs:
|
||||||
|
generate_device: "cuda"
|
||||||
|
prefill_device: "cuda"
|
||||||
|
generate_op: "KLinearTorch"
|
||||||
|
prefill_op: "KLinearTorch"
|
||||||
|
|
||||||
|
# - match:
|
||||||
|
# name: "^model\\.layers\\..*$" # regular expression
|
||||||
|
# class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||||
|
# replace:
|
||||||
|
# class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||||
|
# kwargs:
|
||||||
|
# generate_device: "cuda"
|
||||||
|
# prefill_device: "cuda"
|
||||||
|
# generate_op: "KLinearTorch"
|
||||||
|
# prefill_op: "KLinearTorch"
|
||||||
|
- match:
|
||||||
|
name: "^model\\.layers\\.(?!.*mlp\\.shared_expert_gate).*$" # regular expression
|
||||||
|
class: torch.nn.Linear # only match modules matching name and class simultaneously
|
||||||
|
replace:
|
||||||
|
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
|
||||||
|
kwargs:
|
||||||
|
generate_device: "cuda"
|
||||||
|
prefill_device: "cuda"
|
||||||
|
generate_op: "KLinearTorch"
|
||||||
|
prefill_op: "KLinearTorch"
|
||||||
|
- match:
|
||||||
|
name: "^model\\.layers\\..*\\.mlp$"
|
||||||
|
replace:
|
||||||
|
class: ktransformers.operators.experts.KQwen3MoeSparseMoeBlock # mlp module with custom forward function
|
||||||
|
kwargs:
|
||||||
|
generate_device: "cuda"
|
||||||
|
prefill_device: "cuda"
|
||||||
|
|
||||||
|
- match:
|
||||||
|
name: "^model\\.layers\\..*\\.mlp\\.experts$"
|
||||||
|
replace:
|
||||||
|
class: ktransformers.operators.experts.KTransformersExperts # custom MoE kernel with expert parallelism
|
||||||
|
kwargs:
|
||||||
|
prefill_device: "cuda"
|
||||||
|
prefill_op: "KExpertsTorch"
|
||||||
|
generate_device: "cpu"
|
||||||
|
generate_op: "KSFTExpertsCPU"
|
||||||
|
out_device: "cuda"
|
||||||
|
backend: "AMXInt8" # alternative: "AMXBF16"
|
||||||
|
recursive: False # don't recursively inject submodules of this module
|
||||||
|
- match:
|
||||||
|
name: "^model\\.layers\\..*\\.self_attn$"
|
||||||
|
replace:
|
||||||
|
class: ktransformers.operators.attention.KQwen3MoeAttention # optimized attention implementation
|
||||||
|
kwargs:
|
||||||
|
generate_device: "cuda"
|
||||||
|
prefill_device: "cuda"
|
||||||
|
- match:
|
||||||
|
name: "^model.embed_tokens"
|
||||||
|
replace:
|
||||||
|
class: "default"
|
||||||
|
kwargs:
|
||||||
|
generate_device: "cpu"
|
||||||
|
prefill_device: "cpu"
|
||||||
|
|
||||||
|
- match:
|
||||||
|
name: "^model$"
|
||||||
|
replace:
|
||||||
|
class: "ktransformers.operators.models.KQwen3MoeModel"
|
||||||
|
kwargs:
|
||||||
|
per_layer_prefill_intput_threshold: 0
|
||||||
52
examples/train_lora/qwen3moe_lora_sft_kt.yaml
Normal file
52
examples/train_lora/qwen3moe_lora_sft_kt.yaml
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
### model
|
||||||
|
model_name_or_path: Qwen/Qwen3-235B-A22B-Instruct-2507
|
||||||
|
trust_remote_code: true
|
||||||
|
|
||||||
|
### method
|
||||||
|
stage: sft
|
||||||
|
do_train: true
|
||||||
|
finetuning_type: lora
|
||||||
|
lora_rank: 8
|
||||||
|
lora_target: all
|
||||||
|
|
||||||
|
### dataset
|
||||||
|
dataset: identity, alpaca_en_demo
|
||||||
|
template: qwen3_nothink
|
||||||
|
cutoff_len: 2048
|
||||||
|
max_samples: 100000
|
||||||
|
overwrite_cache: true
|
||||||
|
preprocessing_num_workers: 16
|
||||||
|
dataloader_num_workers: 4
|
||||||
|
|
||||||
|
### output
|
||||||
|
output_dir: saves/Kllama_Qwen3MoE_235bA22b
|
||||||
|
logging_steps: 10
|
||||||
|
save_steps: 200
|
||||||
|
plot_loss: true
|
||||||
|
overwrite_output_dir: true
|
||||||
|
save_only_model: false
|
||||||
|
report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
|
||||||
|
|
||||||
|
### train
|
||||||
|
per_device_train_batch_size: 1
|
||||||
|
gradient_accumulation_steps: 8
|
||||||
|
learning_rate: 1.0e-4
|
||||||
|
num_train_epochs: 3
|
||||||
|
lr_scheduler_type: cosine
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
bf16: true
|
||||||
|
ddp_timeout: 180000000
|
||||||
|
resume_from_checkpoint: null
|
||||||
|
|
||||||
|
### ktransformers
|
||||||
|
use_kt: true # use KTransformers as LoRA sft backend
|
||||||
|
kt_optimize_rule: examples/kt_optimize_rules/Qwen3Moe-sft-amx.yaml
|
||||||
|
cpu_infer: 32
|
||||||
|
chunk_size: 8192
|
||||||
|
|
||||||
|
### eval
|
||||||
|
# eval_dataset: alpaca_en_demo
|
||||||
|
# val_size: 0.1
|
||||||
|
# per_device_eval_batch_size: 1
|
||||||
|
# eval_strategy: steps
|
||||||
|
# eval_steps: 500
|
||||||
Reference in New Issue
Block a user