From bf04ca6af8c82bed0f9562d04c32a3b5851eaa06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B5=AE=E6=A2=A6?= <46097299+frozenleaves@users.noreply.github.com>
Date: Mon, 2 Feb 2026 12:07:19 +0800
Subject: [PATCH] [deps] adapt to transformers v5 (#10147)

Co-authored-by: frozenleaves
Co-authored-by: hiyouga
---
 examples/v1/train_full/train_full_fsdp2.yaml |  2 +-
 pyproject.toml                               |  4 +-
 requirements/deepspeed.txt                   |  2 +-
 src/llamafactory/extras/misc.py              |  4 +-
 src/llamafactory/hparams/data_args.py        |  4 +-
 src/llamafactory/hparams/model_args.py       |  3 -
 src/llamafactory/hparams/parser.py           |  6 --
 src/llamafactory/model/loader.py             |  5 +-
 src/llamafactory/model/model_utils/visual.py |  8 +-
 src/llamafactory/train/callbacks.py          | 10 ++-
 src/llamafactory/train/ppo/workflow.py       |  2 +-
 src/llamafactory/train/rm/trainer.py         |  2 +-
 src/llamafactory/train/rm/workflow.py        |  2 +-
 tests/conftest.py                            | 10 +--
 tests/data/processor/test_feedback.py        |  8 +-
 tests/data/processor/test_pairwise.py        | 19 +++--
 tests/data/processor/test_supervised.py      | 31 +++++++-
 tests/data/processor/test_unsupervised.py    | 14 +++-
 tests/data/test_collator.py                  |  4 +-
 tests/data/test_template.py                  | 84 ++++++++------------
 tests/model/model_utils/test_visual.py       | 14 ++--
 tests/version.txt                            |  2 +-
 tests_v1/core/utils/test_rendering.py        | 29 ++++---
 23 files changed, 149 insertions(+), 120 deletions(-)

diff --git a/examples/v1/train_full/train_full_fsdp2.yaml b/examples/v1/train_full/train_full_fsdp2.yaml
index 3bc5e70cc..dfad62022 100644
--- a/examples/v1/train_full/train_full_fsdp2.yaml
+++ b/examples/v1/train_full/train_full_fsdp2.yaml
@@ -18,7 +18,7 @@ init_config:
   name: init_on_meta
 
 ### data
-train_dataset: data/v1_sft_demo.yaml 
+train_dataset: data/v1_sft_demo.yaml
 
 ### training
 output_dir: outputs/test_fsdp2
diff --git a/pyproject.toml b/pyproject.toml
index 2af5dcc86..39dd0ad31 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,10 +40,10 @@ dependencies = [
     "torch>=2.4.0",
     "torchvision>=0.19.0",
     "torchaudio>=2.4.0",
-    "transformers>=4.51.0,<=4.57.1,!=4.52.0,!=4.57.0",
+    "transformers>=4.51.0,<=5.0.0,!=4.52.0,!=4.57.0",
     "datasets>=2.16.0,<=4.0.0",
     "accelerate>=1.3.0,<=1.11.0",
-    "peft>=0.14.0,<=0.17.1",
+    "peft>=0.18.0,<=0.18.1",
    "trl>=0.18.0,<=0.24.0",
     "torchdata>=0.10.0,<=0.11.0",
     # gui
diff --git a/requirements/deepspeed.txt b/requirements/deepspeed.txt
index fe9116909..2c16ba942 100644
--- a/requirements/deepspeed.txt
+++ b/requirements/deepspeed.txt
@@ -1 +1 @@
-deepspeed>=0.10.0,<=0.16.9
+deepspeed>=0.10.0,<=0.18.4
diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py
index 47de14bb1..5c077dd57 100644
--- a/src/llamafactory/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
@@ -94,10 +94,10 @@ def check_version(requirement: str, mandatory: bool = False) -> None:
 
 def check_dependencies() -> None:
     r"""Check the version of the required packages."""
-    check_version("transformers>=4.51.0,<=4.57.1")
+    check_version("transformers>=4.51.0,<=5.0.0")
     check_version("datasets>=2.16.0,<=4.0.0")
     check_version("accelerate>=1.3.0,<=1.11.0")
-    check_version("peft>=0.14.0,<=0.17.1")
+    check_version("peft>=0.18.0,<=0.18.1")
     check_version("trl>=0.18.0,<=0.24.0")
 
 
diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py
index 49d75cf08..11ad513d2 100644
--- a/src/llamafactory/hparams/data_args.py
+++ b/src/llamafactory/hparams/data_args.py
@@ -65,7 +65,9 @@ class DataArguments:
     )
     mix_strategy: Literal["concat", "interleave_under", "interleave_over", "interleave_once"] = field(
         default="concat",
-        metadata={"help": "Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling/sampling w.o. replacement)."},
+        metadata={
+            "help": "Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling/sampling w.o. replacement)."
+        },
     )
     interleave_probs: str | None = field(
         default=None,
diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py
index a245428fe..2bfaa2734 100644
--- a/src/llamafactory/hparams/model_args.py
+++ b/src/llamafactory/hparams/model_args.py
@@ -206,9 +206,6 @@ class BaseModelArguments:
         if self.model_name_or_path is None:
             raise ValueError("Please provide `model_name_or_path`.")
 
-        if self.split_special_tokens and self.use_fast_tokenizer:
-            raise ValueError("`split_special_tokens` is only supported for slow tokenizers.")
-
         if self.adapter_name_or_path is not None:  # support merging multiple lora weights
             self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")]
 
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index 5cb438919..81762635b 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -139,10 +139,6 @@ def _verify_model_args(
         if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1:
             raise ValueError("Quantized model only accepts a single adapter. Merge them first.")
 
-    if data_args.template == "yi" and model_args.use_fast_tokenizer:
-        logger.warning_rank0("We should use slow tokenizer for the Yi models. Change `use_fast_tokenizer` to False.")
-        model_args.use_fast_tokenizer = False
-
 
 def _check_extra_dependencies(
     model_args: "ModelArguments",
@@ -188,9 +184,7 @@ def _check_extra_dependencies(
 
     if training_args is not None:
         if training_args.deepspeed:
-            # pin deepspeed version < 0.17 because of https://github.com/deepspeedai/DeepSpeed/issues/7347
             check_version("deepspeed", mandatory=True)
-            check_version("deepspeed>=0.10.0,<=0.16.9")
 
         if training_args.predict_with_generate:
             check_version("jieba", mandatory=True)
diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py
index 88856492d..d838d2176 100644
--- a/src/llamafactory/model/loader.py
+++ b/src/llamafactory/model/loader.py
@@ -22,7 +22,6 @@ from transformers import (
     AutoModelForImageTextToText,
     AutoModelForSeq2SeqLM,
     AutoModelForTextToWaveform,
-    AutoModelForVision2Seq,
     AutoProcessor,
     AutoTokenizer,
 )
@@ -166,11 +165,9 @@ def load_model(
         else:
             if type(config) in AutoModelForImageTextToText._model_mapping.keys():  # image-text
                 load_class = AutoModelForImageTextToText
-            elif type(config) in AutoModelForVision2Seq._model_mapping.keys():  # image-text
-                load_class = AutoModelForVision2Seq
             elif type(config) in AutoModelForSeq2SeqLM._model_mapping.keys():  # audio-text
                 load_class = AutoModelForSeq2SeqLM
-            elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio hack for qwen omni
+            elif type(config) in AutoModelForTextToWaveform._model_mapping.keys():  # audio-text for qwen omni
                 load_class = AutoModelForTextToWaveform
             else:
                 load_class = AutoModelForCausalLM
diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py
index 80f2e187b..0d23b6e23 100644
--- a/src/llamafactory/model/model_utils/visual.py
+++ b/src/llamafactory/model/model_utils/visual.py
@@ -374,7 +374,13 @@ _register_composite_model(
 _register_composite_model(
     model_type="qwen3_omni_moe_thinker",
     projector_key="visual.merger",
-    vision_model_keys=["visual.pos_embed", "visual.patch_embed", "visual.blocks", "visual.deepstack_merger_list", "audio_tower"],
+    vision_model_keys=[
+        "visual.pos_embed",
+        "visual.patch_embed",
+        "visual.blocks",
+        "visual.deepstack_merger_list",
+        "audio_tower",
+    ],
     language_model_keys=["model", "lm_head"],
     lora_conflict_keys=["patch_embed"],
 )
diff --git a/src/llamafactory/train/callbacks.py b/src/llamafactory/train/callbacks.py
index 5619568ea..d164c0443 100644
--- a/src/llamafactory/train/callbacks.py
+++ b/src/llamafactory/train/callbacks.py
@@ -103,7 +103,9 @@ class FixValueHeadModelCallback(TrainerCallback):
         if args.should_save:
             output_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
             fix_valuehead_checkpoint(
-                model=kwargs.pop("model"), output_dir=output_dir, safe_serialization=args.save_safetensors
+                model=kwargs.pop("model"),
+                output_dir=output_dir,
+                safe_serialization=getattr(args, "save_safetensors", True),
             )
 
 
@@ -137,7 +139,7 @@ class PissaConvertCallback(TrainerCallback):
         if isinstance(model, PeftModel):
             init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights")
             setattr(model.peft_config["default"], "init_lora_weights", True)
-            model.save_pretrained(pissa_init_dir, safe_serialization=args.save_safetensors)
+            model.save_pretrained(pissa_init_dir, safe_serialization=getattr(args, "save_safetensors", True))
             setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights)
 
     @override
@@ -155,11 +157,11 @@ class PissaConvertCallback(TrainerCallback):
         if isinstance(model, PeftModel):
             init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights")
             setattr(model.peft_config["default"], "init_lora_weights", True)
-            model.save_pretrained(pissa_backup_dir, safe_serialization=args.save_safetensors)
+            model.save_pretrained(pissa_backup_dir, safe_serialization=getattr(args, "save_safetensors", True))
             setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights)
             model.save_pretrained(
                 pissa_convert_dir,
-                safe_serialization=args.save_safetensors,
+                safe_serialization=getattr(args, "save_safetensors", True),
                 path_initial_model_for_weight_conversion=pissa_init_dir,
             )
             model.load_adapter(pissa_backup_dir, "default", is_trainable=True)
diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py
index fa6629a25..282a2f683 100644
--- a/src/llamafactory/train/ppo/workflow.py
+++ b/src/llamafactory/train/ppo/workflow.py
@@ -72,7 +72,7 @@ def run_ppo(
         ppo_trainer.ppo_train(resume_from_checkpoint=training_args.resume_from_checkpoint)
         ppo_trainer.save_model()
         if training_args.should_save:
-            fix_valuehead_checkpoint(model, training_args.output_dir, training_args.save_safetensors)
+            fix_valuehead_checkpoint(model, training_args.output_dir, getattr(training_args, "save_safetensors", True))
 
         ppo_trainer.save_state()  # must be called after save_model to have a folder
         if ppo_trainer.is_world_process_zero() and finetuning_args.plot_loss:
diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py
index 9bc207793..f0384681b 100644
--- a/src/llamafactory/train/rm/trainer.py
+++ b/src/llamafactory/train/rm/trainer.py
@@ -114,7 +114,7 @@ class PairwiseTrainer(Trainer):
         if state_dict is None:
             state_dict = self.model.state_dict()
 
-        if self.args.save_safetensors:
+        if getattr(self.args, "save_safetensors", True):
             from collections import defaultdict
 
             ptrs = defaultdict(list)
diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py
index 89b2c95c6..326561c46 100644
--- a/src/llamafactory/train/rm/workflow.py
+++ b/src/llamafactory/train/rm/workflow.py
@@ -65,7 +65,7 @@ def run_rm(
         train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
         trainer.save_model()
         if training_args.should_save:
-            fix_valuehead_checkpoint(model, training_args.output_dir, training_args.save_safetensors)
+            fix_valuehead_checkpoint(model, training_args.output_dir, getattr(training_args, "save_safetensors", True))
 
         trainer.log_metrics("train", train_result.metrics)
         trainer.save_metrics("train", train_result.metrics)
diff --git a/tests/conftest.py b/tests/conftest.py
index 835a15980..976b48ccc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,7 +18,6 @@ Contains shared fixtures, pytest configuration, and custom markers.
 """
 
 import os
-import sys
 
 import pytest
 import torch
@@ -149,14 +148,7 @@ def _manage_distributed_env(request: FixtureRequest, monkeypatch: MonkeyPatch) -
 
         devices_str = ",".join(str(i) for i in range(required))
         monkeypatch.setenv(env_key, devices_str)
-
-        # add project root dir to path for mp run
-        project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-        if project_root not in sys.path:
-            sys.path.insert(0, project_root)
-
-        os.environ["PYTHONPATH"] = project_root + os.pathsep + os.environ.get("PYTHONPATH", "")
-
+        monkeypatch.syspath_prepend(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
     else:  # non-distributed test
         if old_value:
             visible_devices = [v for v in old_value.split(",") if v != ""]
diff --git a/tests/data/processor/test_feedback.py b/tests/data/processor/test_feedback.py
index bcd85424f..f2c69d08a 100644
--- a/tests/data/processor/test_feedback.py
+++ b/tests/data/processor/test_feedback.py
@@ -20,6 +20,7 @@ from datasets import load_dataset
 from transformers import AutoTokenizer
 
 from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.extras.packages import is_transformers_version_greater_than
 from llamafactory.train.test_utils import load_dataset_module
 
 
@@ -52,7 +53,12 @@ def test_feedback_data(num_samples: int):
     for index in indexes:
         messages = original_data["messages"][index]
         ref_input_ids = ref_tokenizer.apply_chat_template(messages)
-        prompt_len = len(ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True))
+        ref_prompt_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
+        if is_transformers_version_greater_than("5.0.0"):
+            ref_input_ids = ref_input_ids["input_ids"]
+            ref_prompt_ids = ref_prompt_ids["input_ids"]
+
+        prompt_len = len(ref_prompt_ids)
         ref_labels = [IGNORE_INDEX] * prompt_len + ref_input_ids[prompt_len:]
         assert train_dataset["input_ids"][index] == ref_input_ids
         assert train_dataset["labels"][index] == ref_labels
diff --git a/tests/data/processor/test_pairwise.py b/tests/data/processor/test_pairwise.py
index 6047afd02..17d560984 100644
--- a/tests/data/processor/test_pairwise.py
+++ b/tests/data/processor/test_pairwise.py
@@ -20,6 +20,7 @@ from datasets import load_dataset
 from transformers import AutoTokenizer
 
 from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.extras.packages import is_transformers_version_greater_than
 from llamafactory.train.test_utils import load_dataset_module
 
 
@@ -63,13 +64,21 @@ def test_pairwise_data(num_samples: int):
         rejected_messages = original_data["conversations"][index] + [original_data["rejected"][index]]
         chosen_messages = _convert_sharegpt_to_openai(chosen_messages)
         rejected_messages = _convert_sharegpt_to_openai(rejected_messages)
+
         ref_chosen_input_ids = ref_tokenizer.apply_chat_template(chosen_messages)
-        chosen_prompt_len = len(ref_tokenizer.apply_chat_template(chosen_messages[:-1], add_generation_prompt=True))
-        ref_chosen_labels = [IGNORE_INDEX] * chosen_prompt_len + ref_chosen_input_ids[chosen_prompt_len:]
+        ref_chosen_prompt_ids = ref_tokenizer.apply_chat_template(chosen_messages[:-1], add_generation_prompt=True)
         ref_rejected_input_ids = ref_tokenizer.apply_chat_template(rejected_messages)
-        rejected_prompt_len = len(
-            ref_tokenizer.apply_chat_template(rejected_messages[:-1], add_generation_prompt=True)
-        )
+        ref_rejected_prompt_ids = ref_tokenizer.apply_chat_template(rejected_messages[:-1], add_generation_prompt=True)
+
+        if is_transformers_version_greater_than("5.0.0"):
+            ref_chosen_input_ids = ref_chosen_input_ids["input_ids"]
+            ref_rejected_input_ids = ref_rejected_input_ids["input_ids"]
+            ref_chosen_prompt_ids = ref_chosen_prompt_ids["input_ids"]
+            ref_rejected_prompt_ids = ref_rejected_prompt_ids["input_ids"]
+
+        chosen_prompt_len = len(ref_chosen_prompt_ids)
+        rejected_prompt_len = len(ref_rejected_prompt_ids)
+        ref_chosen_labels = [IGNORE_INDEX] * chosen_prompt_len + ref_chosen_input_ids[chosen_prompt_len:]
         ref_rejected_labels = [IGNORE_INDEX] * rejected_prompt_len + ref_rejected_input_ids[rejected_prompt_len:]
         assert train_dataset["chosen_input_ids"][index] == ref_chosen_input_ids
         assert train_dataset["chosen_labels"][index] == ref_chosen_labels
diff --git a/tests/data/processor/test_supervised.py b/tests/data/processor/test_supervised.py
index 0179c1a36..f515852e1 100644
--- a/tests/data/processor/test_supervised.py
+++ b/tests/data/processor/test_supervised.py
@@ -20,6 +20,7 @@ from datasets import load_dataset
 from transformers import AutoTokenizer
 
 from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.extras.packages import is_transformers_version_greater_than
 from llamafactory.train.test_utils import load_dataset_module
 
 
@@ -59,7 +60,16 @@ def test_supervised_single_turn(num_samples: int):
             {"role": "assistant", "content": original_data["output"][index]},
         ]
         ref_input_ids = ref_tokenizer.apply_chat_template(messages)
+        ref_prompt_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
+
+        if is_transformers_version_greater_than("5.0.0"):
+            ref_input_ids = ref_input_ids["input_ids"]
+            ref_prompt_ids = ref_prompt_ids["input_ids"]
+
+        prompt_len = len(ref_prompt_ids)
+        ref_label_ids = [IGNORE_INDEX] * prompt_len + ref_input_ids[prompt_len:]
         assert train_dataset["input_ids"][index] == ref_input_ids
+        assert train_dataset["labels"][index] == ref_label_ids
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
@@ -73,6 +83,10 @@ def test_supervised_multi_turn(num_samples: int):
     indexes = random.choices(range(len(original_data)), k=num_samples)
     for index in indexes:
         ref_input_ids = ref_tokenizer.apply_chat_template(original_data["messages"][index])
+        if is_transformers_version_greater_than("5.0.0"):
+            ref_input_ids = ref_input_ids["input_ids"]
+
+        # cannot test the label ids in multi-turn case
         assert train_dataset["input_ids"][index] == ref_input_ids
 
 
@@ -86,9 +100,12 @@ def test_supervised_train_on_prompt(num_samples: int):
     original_data = load_dataset(DEMO_DATA, name="system_chat", split="train")
     indexes = random.choices(range(len(original_data)), k=num_samples)
     for index in indexes:
-        ref_ids = ref_tokenizer.apply_chat_template(original_data["messages"][index])
-        assert train_dataset["input_ids"][index] == ref_ids
-        assert train_dataset["labels"][index] == ref_ids
+        ref_input_ids = ref_tokenizer.apply_chat_template(original_data["messages"][index])
+        if is_transformers_version_greater_than("5.0.0"):
+            ref_input_ids = ref_input_ids["input_ids"]
+
+        assert train_dataset["input_ids"][index] == ref_input_ids
+        assert train_dataset["labels"][index] == ref_input_ids
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
@@ -103,7 +120,13 @@ def test_supervised_mask_history(num_samples: int):
     for index in indexes:
         messages = original_data["messages"][index]
         ref_input_ids = ref_tokenizer.apply_chat_template(messages)
-        prompt_len = len(ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True))
+        ref_prompt_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
+
+        if is_transformers_version_greater_than("5.0.0"):
+            ref_input_ids = ref_input_ids["input_ids"]
+            ref_prompt_ids = ref_prompt_ids["input_ids"]
+
+        prompt_len = len(ref_prompt_ids)
         ref_label_ids = [IGNORE_INDEX] * prompt_len + ref_input_ids[prompt_len:]
         assert train_dataset["input_ids"][index] == ref_input_ids
         assert train_dataset["labels"][index] == ref_label_ids
diff --git a/tests/data/processor/test_unsupervised.py b/tests/data/processor/test_unsupervised.py
index 05f6cf9a0..2d06219fc 100644
--- a/tests/data/processor/test_unsupervised.py
+++ b/tests/data/processor/test_unsupervised.py
@@ -19,6 +19,7 @@ import pytest
 from datasets import load_dataset
 from transformers import AutoTokenizer
 
+from llamafactory.extras.packages import is_transformers_version_greater_than
 from llamafactory.train.test_utils import load_dataset_module
 
 
@@ -55,8 +56,13 @@ def test_unsupervised_data(num_samples: int):
     indexes = random.choices(range(len(original_data)), k=num_samples)
     for index in indexes:
         messages = original_data["messages"][index]
-        ref_ids = ref_tokenizer.apply_chat_template(messages)
-        ref_input_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
-        ref_labels = ref_ids[len(ref_input_ids) :]
-        assert train_dataset["input_ids"][index] == ref_input_ids
+        ref_input_ids = ref_tokenizer.apply_chat_template(messages)
+        ref_prompt_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
+
+        if is_transformers_version_greater_than("5.0.0"):
+            ref_input_ids = ref_input_ids["input_ids"]
+            ref_prompt_ids = ref_prompt_ids["input_ids"]
+
+        ref_labels = ref_input_ids[len(ref_prompt_ids) :]
+        assert train_dataset["input_ids"][index] == ref_prompt_ids
         assert train_dataset["labels"][index] == ref_labels
diff --git a/tests/data/test_collator.py b/tests/data/test_collator.py
index 888030d08..63370b1b6 100644
--- a/tests/data/test_collator.py
+++ b/tests/data/test_collator.py
@@ -17,7 +17,7 @@ import os
 import pytest
 import torch
 from PIL import Image
-from transformers import AutoConfig, AutoModelForVision2Seq
+from transformers import AutoConfig, AutoModelForImageTextToText
 
 from llamafactory.data import get_template_and_fix_tokenizer
 from llamafactory.data.collator import MultiModalDataCollatorForSeq2Seq, prepare_4d_attention_mask
@@ -82,7 +82,7 @@ def test_multimodal_collator():
     template = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)
     config = AutoConfig.from_pretrained(model_args.model_name_or_path)
     with torch.device("meta"):
-        model = AutoModelForVision2Seq.from_config(config)
+        model = AutoModelForImageTextToText.from_config(config)
 
     data_collator = MultiModalDataCollatorForSeq2Seq(
         template=template,
diff --git a/tests/data/test_template.py b/tests/data/test_template.py
index 9f1018976..b9d9ab2d8 100644
--- a/tests/data/test_template.py
+++ b/tests/data/test_template.py
@@ -20,6 +20,7 @@ from transformers import AutoTokenizer
 
 from llamafactory.data import get_template_and_fix_tokenizer
 from llamafactory.data.template import parse_template
+from llamafactory.extras.packages import is_transformers_version_greater_than
 from llamafactory.hparams import DataArguments
 
 
@@ -65,7 +66,6 @@ def _check_template(
     template_name: str,
     prompt_str: str,
    answer_str: str,
-    use_fast: bool,
     messages: list[dict[str, str]] = MESSAGES,
 ) -> None:
     r"""Check template.
@@ -75,13 +75,15 @@ def _check_template(
         template_name: the template name.
         prompt_str: the string corresponding to the prompt part.
         answer_str: the string corresponding to the answer part.
-        use_fast: whether to use fast tokenizer.
         messages: the list of messages.
     """
-    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=use_fast, token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
     content_str = tokenizer.apply_chat_template(messages, tokenize=False)
     content_ids = tokenizer.apply_chat_template(messages, tokenize=True)
 
+    if is_transformers_version_greater_than("5.0.0"):
+        content_ids = content_ids["input_ids"]
+
     template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template=template_name))
     prompt_ids, answer_ids = template.encode_oneturn(tokenizer, messages)
     assert content_str == prompt_str + answer_str
@@ -90,9 +92,8 @@
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
-@pytest.mark.parametrize("use_fast", [True, False])
-def test_encode_oneturn(use_fast: bool):
-    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3, use_fast=use_fast)
+def test_encode_oneturn():
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
     template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
     prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES)
     prompt_str = (
@@ -106,9 +107,8 @@
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
-@pytest.mark.parametrize("use_fast", [True, False])
-def test_encode_multiturn(use_fast: bool):
-    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3, use_fast=use_fast)
+def test_encode_multiturn():
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
     template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
     encoded_pairs = template.encode_multiturn(tokenizer, MESSAGES)
     prompt_str_1 = (
@@ -128,11 +128,10 @@
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
-@pytest.mark.parametrize("use_fast", [True, False])
 @pytest.mark.parametrize("cot_messages", [True, False])
 @pytest.mark.parametrize("enable_thinking", [True, False, None])
-def test_reasoning_encode_oneturn(use_fast: bool, cot_messages: bool, enable_thinking: bool):
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", use_fast=use_fast)
+def test_reasoning_encode_oneturn(cot_messages: bool, enable_thinking: bool):
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
     data_args = DataArguments(template="qwen3", enable_thinking=enable_thinking)
     template = get_template_and_fix_tokenizer(tokenizer, data_args)
     prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES_WITH_THOUGHT if cot_messages else MESSAGES)
@@ -155,11 +154,10 @@
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
-@pytest.mark.parametrize("use_fast", [True, False])
 @pytest.mark.parametrize("cot_messages", [True, False])
 @pytest.mark.parametrize("enable_thinking", [True, False, None])
-def test_reasoning_encode_multiturn(use_fast: bool, cot_messages: bool, enable_thinking: bool):
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", use_fast=use_fast)
+def test_reasoning_encode_multiturn(cot_messages: bool, enable_thinking: bool):
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
     data_args = DataArguments(template="qwen3", enable_thinking=enable_thinking)
     template = get_template_and_fix_tokenizer(tokenizer, data_args)
     encoded_pairs = template.encode_multiturn(tokenizer, MESSAGES_WITH_THOUGHT if cot_messages else MESSAGES)
@@ -185,10 +183,9 @@ def test_reasoning_encode_multiturn(use_fast: bool, cot_messages: bool, enable_t
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
-@pytest.mark.parametrize("use_fast", [True, False])
-def test_jinja_template(use_fast: bool):
-    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3, use_fast=use_fast)
-    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3, use_fast=use_fast)
+def test_jinja_template():
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
+    ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
     template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
     tokenizer.chat_template = template._get_jinja_template(tokenizer)  # llama3 template no replace
     assert tokenizer.chat_template != ref_tokenizer.chat_template
@@ -222,8 +219,7 @@ def test_get_stop_token_ids():
 
 @pytest.mark.runs_on(["cpu", "mps"])
 @pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
-@pytest.mark.parametrize("use_fast", [True, False])
-def test_gemma_template(use_fast: bool):
+def test_gemma_template():
     prompt_str = (
         f"<bos><start_of_turn>user\n{MESSAGES[0]['content']}<end_of_turn>\n"
         f"<start_of_turn>model\n{MESSAGES[1]['content']}<end_of_turn>\n"
@@ -231,13 +227,12 @@ def test_gemma_template(use_fast: bool):
         "<start_of_turn>model\n"
     )
     answer_str = f"{MESSAGES[3]['content']}<end_of_turn>\n"
-    _check_template("google/gemma-3-4b-it", "gemma", prompt_str, answer_str, use_fast)
+    _check_template("google/gemma-3-4b-it", "gemma", prompt_str, answer_str)
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
 @pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
-@pytest.mark.parametrize("use_fast", [True, False])
-def test_gemma2_template(use_fast: bool):
+def test_gemma2_template():
     prompt_str = (
         f"<bos><start_of_turn>user\n{MESSAGES[0]['content']}<end_of_turn>\n"
         f"<start_of_turn>model\n{MESSAGES[1]['content']}<end_of_turn>\n"
@@ -245,13 +240,12 @@ def test_gemma2_template(use_fast: bool):
         "<start_of_turn>model\n"
     )
     answer_str = f"{MESSAGES[3]['content']}<end_of_turn>\n"
-    _check_template("google/gemma-2-2b-it", "gemma2", prompt_str, answer_str, use_fast)
+    _check_template("google/gemma-2-2b-it", "gemma2", prompt_str, answer_str)
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
 @pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
-@pytest.mark.parametrize("use_fast", [True, False])
-def test_llama3_template(use_fast: bool):
+def test_llama3_template():
     prompt_str = (
         f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{MESSAGES[0]['content']}<|eot_id|>"
         f"<|start_header_id|>assistant<|end_header_id|>\n\n{MESSAGES[1]['content']}<|eot_id|>"
@@ -259,14 +253,11 @@ def test_llama3_template(use_fast: bool):
         "<|start_header_id|>assistant<|end_header_id|>\n\n"
     )
     answer_str = f"{MESSAGES[3]['content']}<|eot_id|>"
-    _check_template("meta-llama/Meta-Llama-3-8B-Instruct", "llama3", prompt_str, answer_str, use_fast)
+    _check_template("meta-llama/Meta-Llama-3-8B-Instruct", "llama3", prompt_str, answer_str)
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
-@pytest.mark.parametrize(
-    "use_fast", [True, pytest.param(False, marks=pytest.mark.xfail(reason="Llama 4 has no slow tokenizer."))]
-)
-def test_llama4_template(use_fast: bool):
+def test_llama4_template():
     prompt_str = (
         f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{MESSAGES[0]['content']}<|eot|>"
         f"<|header_start|>assistant<|header_end|>\n\n{MESSAGES[1]['content']}<|eot|>"
@@ -274,18 +265,11 @@ def test_llama4_template(use_fast: bool):
         "<|header_start|>assistant<|header_end|>\n\n"
     )
     answer_str = f"{MESSAGES[3]['content']}<|eot|>"
-    _check_template(TINY_LLAMA4, "llama4", prompt_str, answer_str, use_fast)
+    _check_template(TINY_LLAMA4, "llama4", prompt_str, answer_str)
 
 
-@pytest.mark.parametrize(
-    "use_fast",
-    [
-        pytest.param(True, marks=pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")),
-        pytest.param(False, marks=pytest.mark.xfail(reason="Phi-4 slow tokenizer is broken.")),
-    ],
-)
 @pytest.mark.runs_on(["cpu", "mps"])
-def test_phi4_template(use_fast: bool):
+def test_phi4_template():
     prompt_str = (
         f"<|im_start|>user<|im_sep|>{MESSAGES[0]['content']}<|im_end|>"
         f"<|im_start|>assistant<|im_sep|>{MESSAGES[1]['content']}<|im_end|>"
@@ -293,13 +277,12 @@ def test_phi4_template(use_fast: bool):
         "<|im_start|>assistant<|im_sep|>"
     )
     answer_str = f"{MESSAGES[3]['content']}<|im_end|>"
-    _check_template("microsoft/phi-4", "phi4", prompt_str, answer_str, use_fast)
+    _check_template("microsoft/phi-4", "phi4", prompt_str, answer_str)
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
 @pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")
-@pytest.mark.parametrize("use_fast", [True, False])
-def test_qwen2_5_template(use_fast: bool):
+def test_qwen2_5_template():
     prompt_str = (
         "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n"
         f"<|im_start|>user\n{MESSAGES[0]['content']}<|im_end|>\n"
@@ -308,13 +291,12 @@ def test_qwen2_5_template(use_fast: bool):
         "<|im_start|>assistant\n"
     )
     answer_str = f"{MESSAGES[3]['content']}<|im_end|>\n"
-    _check_template("Qwen/Qwen2.5-7B-Instruct", "qwen", prompt_str, answer_str, use_fast)
+    _check_template("Qwen/Qwen2.5-7B-Instruct", "qwen", prompt_str, answer_str)
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
-@pytest.mark.parametrize("use_fast", [True, False])
 @pytest.mark.parametrize("cot_messages", [True, False])
-def test_qwen3_template(use_fast: bool, cot_messages: bool):
+def test_qwen3_template(cot_messages: bool):
     prompt_str = (
         f"<|im_start|>user\n{MESSAGES[0]['content']}<|im_end|>\n"
         f"<|im_start|>assistant\n{MESSAGES[1]['content']}<|im_end|>\n"
@@ -328,12 +310,12 @@ def test_qwen3_template(use_fast: bool, cot_messages: bool):
         answer_str = f"{MESSAGES_WITH_THOUGHT[3]['content']}<|im_end|>\n"
         messages = MESSAGES_WITH_THOUGHT
 
-    _check_template("Qwen/Qwen3-8B", "qwen3", prompt_str, answer_str, use_fast, messages=messages)
+    _check_template("Qwen/Qwen3-8B", "qwen3", prompt_str, answer_str, messages=messages)
 
 
 @pytest.mark.runs_on(["cpu", "mps"])
 def test_parse_llama3_template():
-    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3, token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
     template = parse_template(tokenizer)
     assert template.format_user.slots == [
         "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
@@ -348,7 +330,7 @@
 @pytest.mark.runs_on(["cpu", "mps"])
 @pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")
 def test_parse_qwen_template():
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
     template = parse_template(tokenizer)
     assert template.__class__.__name__ == "Template"
     assert template.format_user.slots == ["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
@@ -361,7 +343,7 @@
 @pytest.mark.runs_on(["cpu", "mps"])
 @pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")
 def test_parse_qwen3_template():
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", token=HF_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
     template = parse_template(tokenizer)
     assert template.__class__.__name__ == "ReasoningTemplate"
     assert template.format_user.slots == ["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]
diff --git a/tests/model/model_utils/test_visual.py b/tests/model/model_utils/test_visual.py
index fc53b69c2..b19575722 100644
--- a/tests/model/model_utils/test_visual.py
+++ b/tests/model/model_utils/test_visual.py
@@ -16,7 +16,8 @@ import os
 
 import pytest
 import torch
-from transformers import AutoConfig, AutoModelForVision2Seq
+from safetensors.torch import load_file
+from transformers import AutoConfig, AutoModelForImageTextToText
 
 from llamafactory.extras.packages import is_transformers_version_greater_than
 from llamafactory.hparams import FinetuningArguments, ModelArguments
@@ -36,7 +37,7 @@ def test_visual_full(freeze_vision_tower: bool, freeze_multi_modal_projector: bo
     )
     config = AutoConfig.from_pretrained(model_args.model_name_or_path)
     with torch.device("meta"):
-        model = AutoModelForVision2Seq.from_config(config)
+        model = AutoModelForImageTextToText.from_config(config)
 
     model = init_adapter(config, model, model_args, finetuning_args, is_trainable=True)
     for name, param in model.named_parameters():
@@ -56,7 +57,7 @@ def test_visual_lora(freeze_vision_tower: bool, freeze_language_model: bool):
     )
     config = AutoConfig.from_pretrained(model_args.model_name_or_path)
     with torch.device("meta"):
-        model = AutoModelForVision2Seq.from_config(config)
+        model = AutoModelForImageTextToText.from_config(config)
 
     model = init_adapter(config, model, model_args, finetuning_args, is_trainable=True)
     trainable_params, frozen_params = set(), set()
@@ -86,13 +87,14 @@ def test_visual_model_save_load():
     finetuning_args = FinetuningArguments(finetuning_type="full")
     config = AutoConfig.from_pretrained(model_args.model_name_or_path)
     with torch.device("meta"):
-        model = AutoModelForVision2Seq.from_config(config)
+        model = AutoModelForImageTextToText.from_config(config)
 
     model = init_adapter(config, model, model_args, finetuning_args, is_trainable=False)
+    model.to_empty(device="cpu")
     loaded_model_weight = dict(model.named_parameters())
 
-    model.save_pretrained(os.path.join("output", "qwen2_vl"), max_shard_size="10GB", safe_serialization=False)
-    saved_model_weight = torch.load(os.path.join("output", "qwen2_vl", "pytorch_model.bin"), weights_only=False)
+    model.save_pretrained(os.path.join("output", "qwen2_vl"), max_shard_size="10GB", safe_serialization=True)
+    saved_model_weight = load_file(os.path.join("output", "qwen2_vl", "model.safetensors"))
 
     if is_transformers_version_greater_than("4.52.0"):
         assert "model.language_model.layers.0.self_attn.q_proj.weight" in loaded_model_weight
diff --git a/tests/version.txt b/tests/version.txt
index a1f9032b0..fdd7d35a4 100644
--- a/tests/version.txt
+++ b/tests/version.txt
@@ -1,2 +1,2 @@
 # change if test fails or cache is outdated
-0.9.5.105
+0.9.5.106
diff --git a/tests_v1/core/utils/test_rendering.py b/tests_v1/core/utils/test_rendering.py
index f3e5f83c6..7e4797805 100644
--- a/tests_v1/core/utils/test_rendering.py
+++ b/tests_v1/core/utils/test_rendering.py
@@ -23,6 +23,13 @@ from llamafactory.v1.core.utils.rendering import Renderer
 from llamafactory.v1.utils.types import Processor
 
 
+def _get_input_ids(inputs: list | dict) -> list:
+    if not isinstance(inputs, list):
+        return inputs["input_ids"]
+    else:
+        return inputs
+
+
 HF_MESSAGES = [
     {"role": "system", "content": "You are a helpful assistant."},
     {"role": "user", "content": "What is LLM?"},
@@ -81,15 +88,15 @@ def test_chatml_rendering():
     tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
     renderer = Renderer(template="chatml", processor=tokenizer)
 
-    hf_inputs = tokenizer.apply_chat_template(HF_MESSAGES[:-1], add_generation_prompt=True)
+    hf_inputs = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES[:-1], add_generation_prompt=True))
     v1_inputs = renderer.render_messages(V1_MESSAGES[:-1], is_generate=True)
     assert v1_inputs["input_ids"] == hf_inputs
     assert v1_inputs["attention_mask"] == [1] * len(hf_inputs)
     assert v1_inputs["labels"] == [-100] * len(hf_inputs)
     assert v1_inputs["loss_weights"] == [0.0] * len(hf_inputs)
 
-    hf_inputs_part = tokenizer.apply_chat_template(HF_MESSAGES[:-1], add_generation_prompt=False)
-    hf_inputs_full = tokenizer.apply_chat_template(HF_MESSAGES, add_generation_prompt=False)
+    hf_inputs_part = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES[:-1], add_generation_prompt=False))
+    hf_inputs_full = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES, add_generation_prompt=False))
     v1_inputs_full = renderer.render_messages(V1_MESSAGES, is_generate=False)
     assert v1_inputs_full["input_ids"] == hf_inputs_full
     assert v1_inputs_full["attention_mask"] == [1] * len(hf_inputs_full)
@@ -124,17 +131,21 @@ def test_qwen3_nothink_rendering():
     tokenizer: Processor = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")
     renderer = Renderer(template="qwen3_nothink", processor=tokenizer)
 
-    hf_inputs = tokenizer.apply_chat_template(HF_MESSAGES_WITH_TOOLS[:-1], tools=V1_TOOLS, add_generation_prompt=True)
+    hf_inputs = _get_input_ids(
+        tokenizer.apply_chat_template(HF_MESSAGES_WITH_TOOLS[:-1], tools=V1_TOOLS, add_generation_prompt=True)
+    )
     v1_inputs = renderer.render_messages(V1_MESSAGES_WITH_TOOLS[:-1], tools=json.dumps(V1_TOOLS), is_generate=True)
     assert v1_inputs["input_ids"] == hf_inputs
     assert v1_inputs["attention_mask"] == [1] * len(hf_inputs)
     assert v1_inputs["labels"] == [-100] * len(hf_inputs)
     assert v1_inputs["loss_weights"] == [0.0] * len(hf_inputs)
 
-    hf_inputs_part = tokenizer.apply_chat_template(
-        HF_MESSAGES_WITH_TOOLS[:-1], tools=V1_TOOLS, add_generation_prompt=False
+    hf_inputs_part = _get_input_ids(
+        tokenizer.apply_chat_template(HF_MESSAGES_WITH_TOOLS[:-1], tools=V1_TOOLS, add_generation_prompt=False)
+    )
+    hf_inputs_full = _get_input_ids(
+        tokenizer.apply_chat_template(HF_MESSAGES_WITH_TOOLS, tools=V1_TOOLS, add_generation_prompt=False)
     )
-    hf_inputs_full = tokenizer.apply_chat_template(HF_MESSAGES_WITH_TOOLS, tools=V1_TOOLS, add_generation_prompt=False)
     v1_inputs_full = renderer.render_messages(V1_MESSAGES_WITH_TOOLS, tools=json.dumps(V1_TOOLS), is_generate=False)
     assert v1_inputs_full["input_ids"] == hf_inputs_full
     assert v1_inputs_full["attention_mask"] == [1] * len(hf_inputs_full)
@@ -187,7 +198,7 @@ def test_qwen3_nothink_rendering_remote(num_samples: int):
 def test_process_sft_samples():
     tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
     renderer = Renderer(template="chatml", processor=tokenizer)
-    hf_inputs = tokenizer.apply_chat_template(HF_MESSAGES)
+    hf_inputs = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES))
 
     samples = [{"messages": V1_MESSAGES, "extra_info": "test", "_dataset_name": "default"}]
     model_inputs = renderer.process_samples(samples)
@@ -200,7 +211,7 @@
 def test_process_dpo_samples():
     tokenizer: Processor = AutoTokenizer.from_pretrained("llamafactory/tiny-random-qwen3")
     renderer = Renderer(template="chatml", processor=tokenizer)
-    hf_inputs = tokenizer.apply_chat_template(HF_MESSAGES)
+    hf_inputs = _get_input_ids(tokenizer.apply_chat_template(HF_MESSAGES))
 
     samples = [
         {