Register "visual.pos_embed" as a vision-model key for the Qwen3 multimodal models.

This patch prepends "visual.pos_embed" to the vision_model_keys list passed to
_register_composite_model for three model types: qwen3_vl, qwen3_vl_moe and
qwen3_omni_moe_thinker. No other argument of those registrations changes.

NOTE(review): presumably this makes the vision position embedding follow the
same handling as the rest of the vision tower wherever vision_model_keys is
consumed; confirm against the callers of the composite-model registry.

diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py
index 955c52291..80f2e187b 100644
--- a/src/llamafactory/model/model_utils/visual.py
+++ b/src/llamafactory/model/model_utils/visual.py
@@ -356,7 +356,7 @@ _register_composite_model(
 _register_composite_model(
     model_type="qwen3_vl",
     projector_key="visual.merger",
-    vision_model_keys=["visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"],
+    vision_model_keys=["visual.pos_embed", "visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"],
     language_model_keys=["language_model", "lm_head"],
     lora_conflict_keys=["patch_embed"],
 )
@@ -365,7 +365,7 @@ _register_composite_model(
 _register_composite_model(
     model_type="qwen3_vl_moe",
     projector_key="visual.merger",
-    vision_model_keys=["visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"],
+    vision_model_keys=["visual.pos_embed", "visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list"],
     language_model_keys=["language_model", "lm_head"],
     lora_conflict_keys=["patch_embed"],
 )
@@ -374,7 +374,7 @@ _register_composite_model(
 _register_composite_model(
     model_type="qwen3_omni_moe_thinker",
     projector_key="visual.merger",
-    vision_model_keys=["visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list", "audio_tower"],
+    vision_model_keys=["visual.pos_embed", "visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list", "audio_tower"],
     language_model_keys=["model", "lm_head"],
     lora_conflict_keys=["patch_embed"],
 )