diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d64f74658..2dcf26141 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -35,15 +35,12 @@ jobs: transformers: - "" include: # test backward compatibility - - python: "3.11" - os: "ubuntu-latest" - transformers: "4.51.0" - - python: "3.11" - os: "ubuntu-latest" - transformers: "4.53.0" - python: "3.11" os: "ubuntu-latest" transformers: "4.55.0" + - python: "3.11" + os: "ubuntu-latest" + transformers: "4.57.1" runs-on: ${{ matrix.os }} diff --git a/pyproject.toml b/pyproject.toml index d14019011..6ae7e3ffe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ "torch>=2.4.0", "torchvision>=0.19.0", "torchaudio>=2.4.0", - "transformers>=4.51.0,<=5.2.0,!=4.52.0,!=4.57.0", + "transformers>=4.55.0,<=5.2.0,!=4.52.0,!=4.57.0", "datasets>=2.16.0,<=4.0.0", "accelerate>=1.3.0,<=1.11.0", "peft>=0.18.0,<=0.18.1", diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index 31809eb24..a54970d31 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -15,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
def _slice_mm_inputs_for_sample(
    mm_inputs: dict[str, Any],
    batch_imglens: list[int],
    batch_vidlens: list[int],
    batch_idx: int,
    images_per_subseq: Optional[list[int]] = None,
    videos_per_subseq: Optional[list[int]] = None,
    subseq_idx: Optional[int] = None,
) -> dict[str, Any]:
    r"""Return the portion of ``mm_inputs`` that belongs to one batch sample.

    ``image_grid_thw`` / ``video_grid_thw`` are indexed per multimodal item
    (shape ``[num_items, 3]``); sample ``batch_idx`` owns
    ``batch_imglens[batch_idx]`` images and ``batch_vidlens[batch_idx]`` videos.
    When ``subseq_idx`` is given (packed sequences), the item range is narrowed
    further using the per-sub-sequence counts.

    Args:
        mm_inputs: processor outputs keyed by feature name.
        batch_imglens: number of images per batch sample.
        batch_vidlens: number of videos per batch sample.
        batch_idx: index of the sample to slice out.
        images_per_subseq: image count per sub-sequence of this sample (packing only).
        videos_per_subseq: video count per sub-sequence of this sample (packing only).
        subseq_idx: sub-sequence index within the packed sample, if any.

    Returns:
        A dict holding the sliced values; grid keys are set to ``None`` when the
        sample owns no items, while the per-grid timing keys are simply omitted.
    """
    img_lo = sum(batch_imglens[:batch_idx])
    img_hi = img_lo + batch_imglens[batch_idx]
    vid_lo = sum(batch_vidlens[:batch_idx])
    vid_hi = vid_lo + batch_vidlens[batch_idx]

    if subseq_idx is not None:
        if images_per_subseq is not None:
            img_lo += sum(images_per_subseq[:subseq_idx])
            img_hi = img_lo + images_per_subseq[subseq_idx]

        if videos_per_subseq is not None:
            vid_lo += sum(videos_per_subseq[:subseq_idx])
            vid_hi = vid_lo + videos_per_subseq[subseq_idx]

    # key -> (range start, range end, emit None when the range is empty?)
    slice_plan = {
        "image_grid_thw": (img_lo, img_hi, True),
        "video_grid_thw": (vid_lo, vid_hi, True),
        "second_per_grid_ts": (vid_lo, vid_hi, False),  # qwen2.5vl
        "video_second_per_grid": (vid_lo, vid_hi, False),  # qwen omni
    }

    result: dict[str, Any] = {}
    for key, (lo, hi, none_when_empty) in slice_plan.items():
        if key not in mm_inputs:
            continue

        value = mm_inputs[key]
        if value is not None and hi > lo:
            result[key] = value[lo:hi]
        elif none_when_empty:
            result[key] = None

    return result
@@ -106,9 +157,154 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): else: self.get_rope_func = None + def _compute_rope_position_ids( + self, features: dict[str, "torch.Tensor"], mm_inputs: dict[str, Any] + ) -> None: + r"""Compute position_ids and rope_deltas via get_rope_func for VLMs.""" + rope_index_kwargs = { + "input_ids": features["input_ids"], + "image_grid_thw": mm_inputs.get("image_grid_thw"), + "video_grid_thw": mm_inputs.get("video_grid_thw"), + "attention_mask": (features["attention_mask"] >= 1).float(), + } + if features["attention_mask"].sum() == 0: + features["position_ids"] = torch.zeros((3, *features["input_ids"].shape)) + features["rope_deltas"] = torch.zeros(features["input_ids"].shape[0]) + return + + if "mm_token_type_ids" in inspect.signature(self.get_rope_func).parameters: + image_token_id = getattr(self.model.config, "image_token_id", None) + video_token_id = getattr(self.model.config, "video_token_id", None) + if image_token_id is not None or video_token_id is not None: + mm_token_type_ids = torch.zeros_like(features["input_ids"]) + if image_token_id is not None: + mm_token_type_ids[features["input_ids"] == image_token_id] = 1 + if video_token_id is not None: + mm_token_type_ids[features["input_ids"] == video_token_id] = 2 + rope_index_kwargs["mm_token_type_ids"] = mm_token_type_ids + + if "second_per_grid_ts" in mm_inputs: # for qwen2vl + rope_index_kwargs["second_per_grid_ts"] = mm_inputs.get("second_per_grid_ts") + elif "video_second_per_grid" in mm_inputs: # for qwen2.5 omni + rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid") + + if getattr(self.model.config, "model_type", None) in ["qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]: + rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False) + feature_attention_mask = mm_inputs.get("feature_attention_mask", None) + if feature_attention_mask is not None: # FIXME: need to get video image lengths + 
audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) + rope_index_kwargs["audio_seqlens"] = audio_feature_lengths # prepare for input + + features["position_ids"], rope_deltas = self.get_rope_func(**rope_index_kwargs) + features["rope_deltas"] = rope_deltas - (1 - rope_index_kwargs["attention_mask"]).sum( + dim=-1 + ).unsqueeze(-1) + else: # for qwen vl + features["position_ids"], features["rope_deltas"] = self.get_rope_func(**rope_index_kwargs) + + def _compute_rope_position_ids_with_packing( + self, + features: dict[str, "torch.Tensor"], + mm_inputs: dict[str, Any], + packing_params_list: list[dict[str, Any] | None], + batch_imglens: list[int], + batch_vidlens: list[int], + batch_audlens: list[int], + has_dummy_image: bool, + ) -> None: + r"""Compute position_ids and rope_deltas per sample (or per sub-sequence when packed), then merge and validate.""" + bsz = features["input_ids"].size(0) + seq_len = features["input_ids"].size(1) + all_position_ids: list[torch.Tensor] = [] + all_rope_deltas: list[torch.Tensor] = [] + + if has_dummy_image: + # for [0, seq_len] = [0, unpadded_length + right_padding_length + fake_input_ids_len + collator_padding_length] + # FIXME: maybe right_padding_length is large, with improper max_cutoff_len + unpadded_length = int(features["attention_mask"][0].bool().sum().item()) + right_padding_length = int((packing_params_list[0] or {}).get("right_padding_length") or 0) + fake_input_padding_length = max(0, seq_len - unpadded_length - right_padding_length) + dummy_image_right_padding_mrope = torch.zeros((3, bsz, fake_input_padding_length)) + dummy_image_right_padding_attention_mask = torch.zeros((bsz, fake_input_padding_length)) + assert self.tokenizer.padding_side == "right", "padding_side should be right when fake image is injected" + dummy_mm_inputs = copy.deepcopy(mm_inputs) + + for sample_idx in range(bsz): + sample_packing = (packing_params_list[sample_idx] or {}) if sample_idx < len(packing_params_list) else {} + 
sequence_boundaries = sample_packing.get("sequence_boundaries") + num_sub_seqs = (len(sequence_boundaries) - 1) if sequence_boundaries and len(sequence_boundaries) > 1 else 1 + image_subseq_ids = sample_packing.get("image_subseq_ids") or [] + video_subseq_ids = sample_packing.get("video_subseq_ids") or [] + images_per_subseq = ( + [image_subseq_ids.count(i) for i in range(num_sub_seqs)] if image_subseq_ids and num_sub_seqs > 1 else None + ) + videos_per_subseq = ( + [video_subseq_ids.count(i) for i in range(num_sub_seqs)] if video_subseq_ids and num_sub_seqs > 1 else None + ) + if has_dummy_image: + mm_inputs = {} + + if num_sub_seqs <= 1: + sample_features = { + "input_ids": features["input_ids"], + "attention_mask": features["attention_mask"][sample_idx : sample_idx + 1], + } + mm_inputs_for_sample = _slice_mm_inputs_for_sample( + mm_inputs, batch_imglens, batch_vidlens, sample_idx=sample_idx + ) + self._compute_rope_position_ids(sample_features, mm_inputs_for_sample) + all_position_ids.append(sample_features["position_ids"]) + all_rope_deltas.append(sample_features["rope_deltas"]) + else: + # when we do packing, don't need rope_deltas when training. 
+ sample_position_ids: list[torch.Tensor] = [] + for subseq_idx in range(num_sub_seqs): + subseq_start = sequence_boundaries[subseq_idx] + subseq_end = sequence_boundaries[subseq_idx + 1] + subseq_features = { + "input_ids": features["input_ids"][sample_idx : sample_idx + 1, subseq_start:subseq_end], + "attention_mask": features["attention_mask"][sample_idx : sample_idx + 1, subseq_start:subseq_end], + } + mm_inputs_for_subseq = _slice_mm_inputs_for_sample( + mm_inputs, + batch_imglens, + batch_vidlens, + sample_idx, + images_per_subseq, + videos_per_subseq, + subseq_idx + ) + self._compute_rope_position_ids(subseq_features, mm_inputs_for_subseq) + sample_position_ids.append(subseq_features["position_ids"]) + all_position_ids.append(torch.cat(sample_position_ids, dim=-1)) + + batch_dim_for_position_ids = 1 if all_position_ids[0].dim() == 3 else 0 + + features["position_ids"] = torch.cat(all_position_ids, dim=batch_dim_for_position_ids) + if has_dummy_image: + mm_inputs = dummy_mm_inputs + + expected_position_ids_shape = (bsz, seq_len) if all_position_ids[0].dim() == 2 else ( + all_position_ids[0].size(0), + bsz, + seq_len, + ) + # Check if position_ids shape matches expected shape. + # for further usage, we should padding to the right when some padding token on the right. + if has_dummy_image: + features["position_ids"] = torch.cat([features["position_ids"], dummy_image_right_padding_mrope], dim=-1) + features["attention_mask"] = torch.cat([features["attention_mask"], dummy_image_right_padding_attention_mask], dim=-1) + + if features["position_ids"].shape != expected_position_ids_shape: + raise ValueError( + "Merged position_ids shape mismatch: " + f"got {features['position_ids'].shape}, expected {expected_position_ids_shape}." 
+ ) + def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]: batch_images, batch_videos, batch_audios = [], [], [] batch_imglens, batch_vidlens, batch_audlens, batch_input_ids = [], [], [], [] + packing_params_list: list[dict[str, Any] | None] = [] for feature in features: images = feature.pop("images", None) or [] videos = feature.pop("videos", None) or [] @@ -120,8 +316,10 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): batch_vidlens.append(len(videos)) batch_audlens.append(len(audios)) batch_input_ids.append(feature["input_ids"]) + packing_params_list.append(feature.pop("packing_params", None)) fake_input_ids = [] + has_dummy_image = False if ( self.template.mm_plugin.image_token is not None and sum(batch_imglens) == 0 and sum(batch_vidlens) == 0 ): # avoid process hanging in zero3/fsdp case @@ -137,6 +335,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): fake_input_ids.extend(_fake_input_ids) batch_images = fake_images batch_imglens[0] = 1 + has_dummy_image = True if ( self.template.mm_plugin.audio_token is not None and sum(batch_audlens) == 0 @@ -183,57 +382,50 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): features: dict[str, torch.Tensor] = super().__call__(features) + bsz, seq_len = features["input_ids"].shape[:2] + model_type = getattr(self.model.config, "model_type", None) if self.model is not None else None + is_omni = model_type in [ + "qwen2_5_omni_thinker", + "qwen3_omni_moe_thinker", + ] + if self.get_rope_func is not None: - rope_index_kwargs = { - "input_ids": features["input_ids"], - "image_grid_thw": mm_inputs.get("image_grid_thw"), - "video_grid_thw": mm_inputs.get("video_grid_thw"), - "attention_mask": (features["attention_mask"] >= 1).float(), - } - if "mm_token_type_ids" in inspect.signature(self.get_rope_func).parameters: - image_token_id = getattr(self.model.config, "image_token_id", None) - video_token_id = getattr(self.model.config, "video_token_id", None) 
- if image_token_id is not None or video_token_id is not None: - mm_token_type_ids = torch.zeros_like(features["input_ids"]) - if image_token_id is not None: - mm_token_type_ids[features["input_ids"] == image_token_id] = 1 - if video_token_id is not None: - mm_token_type_ids[features["input_ids"] == video_token_id] = 2 - rope_index_kwargs["mm_token_type_ids"] = mm_token_type_ids - if "second_per_grid_ts" in mm_inputs: # for qwen2vl - rope_index_kwargs["second_per_grid_ts"] = mm_inputs.get("second_per_grid_ts") - elif "video_second_per_grid" in mm_inputs: # for qwen2.5 omni - rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid") + # for mmrope situation, we should calculate position_ids and rope_deltas per sample. + # When neat_packing is on, each sample has packing_params; None means no packing for that sample. + boundaries_list = [ + p.get("sequence_boundaries") if p is not None else None for p in packing_params_list + ] + has_packing = any(b is not None and len(b) > 2 for b in boundaries_list) + if has_dummy_image and has_packing: + # FIXME: too tricky, need to be refactored + features["has_dummy_image"] = True - if getattr(self.model.config, "model_type", None) in ["qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]: - rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False) - feature_attention_mask = mm_inputs.get("feature_attention_mask", None) - if feature_attention_mask is not None: # FIXME: need to get video image lengths - audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) - rope_index_kwargs["audio_seqlens"] = audio_feature_lengths # prepare for input + # When fake image/audio was injected, sequence_boundaries no longer match the tensor; use non-packing path. 
+ if not has_packing: + self._compute_rope_position_ids(features, mm_inputs) + else: + if is_omni: + raise RuntimeError("Omni models are not supported for packed sequences for now.") - features["position_ids"], rope_deltas = self.get_rope_func(**rope_index_kwargs) - features["rope_deltas"] = rope_deltas - (1 - rope_index_kwargs["attention_mask"]).sum( - dim=-1 - ).unsqueeze(-1) - else: # for qwen vl - features["position_ids"], features["rope_deltas"] = self.get_rope_func(**rope_index_kwargs) + self._compute_rope_position_ids_with_packing( + features, + mm_inputs, + packing_params_list, + batch_imglens, + batch_vidlens, + batch_audlens, + has_dummy_image, + ) + + # For transformers compatibility, after https://github.com/huggingface/transformers/issues/39400 + if features["position_ids"].dim() == 3: + features["position_ids"] = torch.cat( + [features["position_ids"][0].unsqueeze(0), features["position_ids"]], dim=0 + ) if ( self.model is not None - and getattr(self.model.config, "model_type", None) - in [ - "glm4v", - "glm_ocr", - "Keye", - "qwen2_vl", - "qwen2_5_vl", - "qwen2_5_omni_thinker", - "qwen3_omni_moe_thinker", - "qwen3_5", - "qwen3_vl", - "qwen3_vl_moe", - ] + and getattr(self.model.config, "model_type", None) in MROPE_MODELS and ("position_ids" not in features or features["position_ids"].dim() != 3) ): raise ValueError(f"{self.model.config.model_type} requires 3D position ids for mrope.") @@ -261,12 +453,51 @@ class SFTDataCollatorWith4DAttentionMask(MultiModalDataCollatorForSeq2Seq): block_diag_attn: bool = False attn_implementation: Literal["eager", "sdpa", "flash_attention_2"] = "eager" compute_dtype: "torch.dtype" = torch.float32 + neat_packing: bool = False + + def __post_init__(self): + super().__post_init__() + if self.neat_packing and self.attn_implementation == "flash_attention_2": + if self.model is not None and getattr(self.model.config, "model_type", None) in ["qwen3_5", "qwen3_5_moe", "gpt_oss"]: + raise ValueError("Neat packing is not 
supported for qwen3_5, qwen3_5_moe, gpt_oss models for now.") + + @staticmethod + def _unpad_packed_features(features: dict[str, Any]) -> None: + r"""Trim padded positions for packed FA2 batches.""" + attention_mask = features.get("attention_mask") + if not torch.is_tensor(attention_mask) or attention_mask.dim() != 2 or attention_mask.size(0) != 1: + return + + seq_len = attention_mask.size(1) + non_padding_indices = torch.nonzero(attention_mask[0] != 0, as_tuple=False).flatten() + if non_padding_indices.numel() == seq_len: + return + + keys_on_seq_dim_1 = {"input_ids", "labels", "attention_mask", "token_type_ids"} + for key, value in list(features.items()): + if not torch.is_tensor(value): + continue + + if key == "position_ids" and value.size(-1) == seq_len: + features[key] = value.index_select(-1, non_padding_indices) + elif key == "cross_attention_mask" and value.dim() >= 2 and value.size(0) == 1 and value.size(1) == seq_len: + features[key] = value.index_select(1, non_padding_indices) + elif key in keys_on_seq_dim_1 and value.dim() == 2 and value.size(0) == 1 and value.size(1) == seq_len: + features[key] = value.index_select(1, non_padding_indices) def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]: features = super().__call__(features) + has_dummy_image = features.pop("has_dummy_image", False) if self.block_diag_attn and self.attn_implementation != "flash_attention_2": features["attention_mask"] = prepare_4d_attention_mask(features["attention_mask"], self.compute_dtype) + if self.neat_packing and self.attn_implementation == "flash_attention_2": # FIXME compatibility fa3/fa4 + assert features["input_ids"].shape[0] == 1, "bsz should be 1 for neat packing" + if not has_dummy_image: + self._unpad_packed_features(features) + + features["attention_mask"] = None # let transformers handle causal packed mask. 
+ for key, value in features.items(): # cast data dtype for paligemma if torch.is_tensor(value) and torch.is_floating_point(value): features[key] = value.to(self.compute_dtype) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index 6b400cfd9..badd29ac6 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -27,11 +27,12 @@ from typing import TYPE_CHECKING, BinaryIO, Literal, NotRequired, Optional, Type import numpy as np import torch import torchaudio -from transformers.image_utils import get_image_size, is_valid_image, to_numpy_array +from transformers.image_utils import get_image_size, is_valid_image, make_flat_list_of_images, to_numpy_array from transformers.models.mllama.processing_mllama import ( convert_sparse_cross_attention_mask_to_dense, get_cross_attention_token_mask, ) +from transformers.video_utils import make_batched_videos from typing_extensions import override from ..extras.constants import AUDIO_PLACEHOLDER, IGNORE_INDEX, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER @@ -47,13 +48,6 @@ if is_pyav_available(): import av -if is_transformers_version_greater_than("4.52.0"): - from transformers.image_utils import make_flat_list_of_images - from transformers.video_utils import make_batched_videos -else: - from transformers.image_utils import make_batched_videos, make_flat_list_of_images - - if TYPE_CHECKING: from av.stream import Stream from numpy.typing import NDArray diff --git a/src/llamafactory/data/processor/supervised.py b/src/llamafactory/data/processor/supervised.py index b5aba11b6..25eb82adb 100644 --- a/src/llamafactory/data/processor/supervised.py +++ b/src/llamafactory/data/processor/supervised.py @@ -13,7 +13,7 @@ # limitations under the License. 
from collections import defaultdict -from dataclasses import dataclass +from dataclasses import asdict, dataclass from typing import TYPE_CHECKING, Any, Optional from ...extras import logging @@ -27,6 +27,23 @@ if TYPE_CHECKING: logger = logging.get_logger(__name__) +MAX_SU_SEQ_IDX = 2**32 # maximum sub-sequence index + +@dataclass +class PackingParams: + r"""Metadata for a packed sequence: sub-sequence boundaries and multimodal data indices. + + - sequence_boundaries: cumulative token positions, e.g. [0, 100, 250, 512] means 3 sub-seqs + with token ranges [0,100), [100,250), [250,512). Length = num_sub_seqs + 1. + - image_subseq_ids / video_subseq_ids / audio_subseq_ids: for each mm item, the 0-based + sub-sequence index it belongs to. Length = total number of that mm type in the packed sample. + """ + + sequence_boundaries: list[int] + image_subseq_ids: list[int] + video_subseq_ids: list[int] + audio_subseq_ids: list[int] + right_padding_length: int @dataclass class SupervisedDatasetProcessor(DatasetProcessor): @@ -162,10 +179,17 @@ class PackedSupervisedDatasetProcessor(SupervisedDatasetProcessor): valid_num += 1 model_inputs = defaultdict(list) + requires_packing_params = self.data_args.neat_packing knapsacks = greedy_knapsack(lengths, self.data_args.cutoff_len) for knapsack in knapsacks: packed_input_ids, packed_attention_masks, packed_position_ids, packed_labels = [], [], [], [] packed_images, packed_videos, packed_audios = [], [], [] + if requires_packing_params: + sequence_boundaries = [0] + image_subseq_ids: list[int] = [] + video_subseq_ids: list[int] = [] + audio_subseq_ids: list[int] = [] + for i, length in enumerate(knapsack): index = length2indexes[length].pop() packed_input_ids += batch_input_ids[index] @@ -174,6 +198,15 @@ class PackedSupervisedDatasetProcessor(SupervisedDatasetProcessor): packed_images += batch_images[index] packed_videos += batch_videos[index] packed_audios += batch_audios[index] + if requires_packing_params: + n_img = 
len(batch_images[index]) + n_vid = len(batch_videos[index]) + n_aud = len(batch_audios[index]) + sequence_boundaries.append(sequence_boundaries[-1] + len(batch_input_ids[index])) + image_subseq_ids.extend([i] * n_img) + video_subseq_ids.extend([i] * n_vid) + audio_subseq_ids.extend([i] * n_aud) + if self.data_args.neat_packing: packed_attention_masks += [i + 1] * len(batch_input_ids[index]) # start from 1 else: @@ -189,10 +222,23 @@ class PackedSupervisedDatasetProcessor(SupervisedDatasetProcessor): else: packed_attention_masks += [1] * pad_length # more efficient flash_attn + if requires_packing_params: + sequence_boundaries.append(sequence_boundaries[-1] + pad_length) + if len(packed_input_ids) != self.data_args.cutoff_len + 1: raise ValueError("The length of packed example should be identical to the cutoff length.") model_inputs["input_ids"].append(packed_input_ids) + if requires_packing_params: + packing_params = PackingParams( + sequence_boundaries=sequence_boundaries, + image_subseq_ids=image_subseq_ids or [MAX_SU_SEQ_IDX], # avoid dataset concat error + video_subseq_ids=video_subseq_ids or [MAX_SU_SEQ_IDX], + audio_subseq_ids=audio_subseq_ids or [MAX_SU_SEQ_IDX], + right_padding_length=pad_length, + ) + model_inputs["packing_params"].append(asdict(packing_params)) + model_inputs["attention_mask"].append(packed_attention_masks) model_inputs["position_ids"].append(packed_position_ids) model_inputs["labels"].append(packed_labels) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 0d2cae65b..87fb1f1c5 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -77,6 +77,20 @@ METHODS = ["full", "freeze", "lora", "oft"] MOD_SUPPORTED_MODELS = {"bloom", "falcon", "gemma", "llama", "mistral", "mixtral", "phi", "starcoder2"} +MROPE_MODELS = { + "glm4v", + "glm_ocr", + "Keye", + "qwen2_vl", + "qwen2_5_vl", + "qwen2_5_omni_thinker", + "qwen3_omni_moe_thinker", + "qwen3_vl", + 
"qwen3_vl_moe", + "qwen3_5", + "qwen3_5_moe", +} + MULTIMODAL_SUPPORTED_MODELS = set() PEFT_METHODS = {"lora", "oft"} diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index 8214267ed..c1f5e6ae2 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -94,7 +94,7 @@ def check_version(requirement: str, mandatory: bool = False) -> None: def check_dependencies() -> None: r"""Check the version of the required packages.""" - check_version("transformers>=4.51.0,<=5.2.0") + check_version("transformers>=4.55.0,<=5.2.0") check_version("datasets>=2.16.0,<=4.0.0") check_version("accelerate>=1.3.0,<=1.11.0") check_version("peft>=0.18.0,<=0.18.1") diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py index ab47a088a..e01e9d782 100644 --- a/src/llamafactory/hparams/parser.py +++ b/src/llamafactory/hparams/parser.py @@ -33,7 +33,7 @@ from transformers.utils import is_torch_bf16_gpu_available, is_torch_npu_availab from ..extras import logging from ..extras.constants import CHECKPOINT_NAMES, EngineName from ..extras.misc import check_dependencies, check_version, get_current_device, is_env_enabled -from ..extras.packages import is_mcore_adapter_available, is_transformers_version_greater_than +from ..extras.packages import is_mcore_adapter_available from .data_args import DataArguments from .evaluation_args import EvaluationArguments from .finetuning_args import FinetuningArguments @@ -394,9 +394,6 @@ def get_train_args(args: dict[str, Any] | list[str] | None = None) -> _TRAIN_CLS if model_args.use_kt and is_deepspeed_zero3_enabled(): raise ValueError("KTransformers is incompatible with DeepSpeed ZeRO-3.") - if data_args.neat_packing and is_transformers_version_greater_than("4.53.0"): - raise ValueError("Neat packing is incompatible with transformers>=4.53.0.") - _set_env_vars() _verify_model_args(model_args, data_args, finetuning_args) _check_extra_dependencies(model_args, finetuning_args, 
training_args) diff --git a/src/llamafactory/model/model_utils/packing.py b/src/llamafactory/model/model_utils/packing.py index 66409f386..edbb30a6c 100644 --- a/src/llamafactory/model/model_utils/packing.py +++ b/src/llamafactory/model/model_utils/packing.py @@ -37,7 +37,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from typing import TYPE_CHECKING import torch import torch.nn.functional as F @@ -45,10 +44,6 @@ import torch.nn.functional as F from ...extras import logging -if TYPE_CHECKING: - from ...hparams import ModelArguments - - logger = logging.get_logger(__name__) @@ -105,13 +100,3 @@ def get_unpad_data(attention_mask: "torch.Tensor") -> tuple["torch.Tensor", "tor max_seqlen_in_batch = seqlens_in_batch.max().item() cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) return indices, cu_seqlens, max_seqlen_in_batch - - -def configure_packing(model_args: "ModelArguments", is_trainable: bool) -> None: - if not is_trainable or not model_args.block_diag_attn: - return - - import transformers.modeling_flash_attention_utils - - transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data - logger.info_rank0("Using block diagonal attention for sequence packing without cross-attention.") diff --git a/src/llamafactory/model/model_utils/visual.py b/src/llamafactory/model/model_utils/visual.py index 4a855f31a..3346d7fc1 100644 --- a/src/llamafactory/model/model_utils/visual.py +++ b/src/llamafactory/model/model_utils/visual.py @@ -24,7 +24,6 @@ import transformers.models from transformers.activations import ACT2FN from ...extras import logging -from ...extras.packages import is_transformers_version_greater_than if TYPE_CHECKING: @@ -344,9 +343,7 @@ _register_composite_model( model_type="qwen2_vl", projector_key="visual.merger", vision_model_keys=["visual.patch_embed", "visual.blocks"], - language_model_keys=["language_model", "lm_head"] - if 
is_transformers_version_greater_than("4.52.0") - else ["model", "lm_head"], + language_model_keys=["language_model", "lm_head"], lora_conflict_keys=["patch_embed"], ) @@ -355,9 +352,7 @@ _register_composite_model( model_type="qwen2_5_vl", projector_key="visual.merger", vision_model_keys=["visual.patch_embed", "visual.blocks"], - language_model_keys=["language_model", "lm_head"] - if is_transformers_version_greater_than("4.52.0") - else ["model", "lm_head"], + language_model_keys=["language_model", "lm_head"], lora_conflict_keys=["patch_embed"], ) diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 52e8ace21..375216f16 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -30,7 +30,6 @@ from .model_utils.embedding import resize_embedding_layer from .model_utils.kv_cache import configure_kv_cache from .model_utils.longlora import configure_longlora from .model_utils.moe import add_z3_leaf_module, configure_moe -from .model_utils.packing import configure_packing from .model_utils.quantization import configure_quantization from .model_utils.rope import configure_rope from .model_utils.valuehead import prepare_valuehead_model @@ -142,7 +141,6 @@ def patch_config( configure_quantization(config, tokenizer, model_args, is_trainable, init_kwargs) configure_moe(config, model_args, is_trainable) configure_visual_model(config) - configure_packing(model_args, is_trainable) configure_kv_cache(config, model_args, is_trainable) if getattr(config, "model_type", None) == "qwen": diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index e3dacf798..dfed95a5c 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -214,7 +214,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): pad_len = np.nonzero(preds[i] != self.processing_class.pad_token_id)[0] if len(pad_len): # move pad token to last preds[i] = np.concatenate((preds[i][pad_len[0] 
:], preds[i][: pad_len[0]]), axis=-1) - + input_ids_column = dataset["input_ids"] try: input_ids_list = input_ids_column.to_pylist() diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index db625028f..3561ecb30 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -65,6 +65,7 @@ def run_sft( pad_to_multiple_of=8 if training_args.do_train else None, # for shift short attention label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id, block_diag_attn=model_args.block_diag_attn, + neat_packing=data_args.neat_packing, attn_implementation=getattr(model.config, "_attn_implementation", None), compute_dtype=model_args.compute_dtype, **tokenizer_module, diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 9e14c763e..0d89e8a31 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -50,9 +50,9 @@ if is_apollo_available(): if is_ray_available(): import ray - from ray.util.state import list_nodes from ray.util.placement_group import PlacementGroup, placement_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + from ray.util.state import list_nodes if TYPE_CHECKING: diff --git a/tests/data/test_collator.py b/tests/data/test_collator.py index d62ee7a48..773fd25b7 100644 --- a/tests/data/test_collator.py +++ b/tests/data/test_collator.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +from collections import Counter import pytest import torch @@ -129,9 +130,177 @@ def test_multimodal_collator(): assert batch_input.keys() == expected_input.keys() for k in batch_input.keys(): + if k == "position_ids" and batch_input[k].dim() == 3 and batch_input[k].shape[0] == 4: + batch_input[k] = batch_input[k][1:] + assert batch_input[k].eq(torch.tensor(expected_input[k])).all() +def _make_packed_feature( + *, + packing_params: dict, + pad_token_id: int, + label_ignore_id: int, + fake_image: Image.Image, + vision_start_id: int | None = None, + vision_end_id: int | None = None, + image_pad_id: int | None = None, +) -> dict: + r"""Build one packed sample using the new PackingParams schema.""" + sequence_boundaries = packing_params["sequence_boundaries"] + image_subseq_ids = packing_params["image_subseq_ids"] + video_subseq_ids = packing_params["video_subseq_ids"] + audio_subseq_ids = packing_params["audio_subseq_ids"] + unpadded_length = packing_params["unpadded_length"] + right_padding_length = packing_params["right_padding_length"] # which is only preserved in tests + cutoff_plus_one = sequence_boundaries[-1] + content_len = unpadded_length + pad_len = right_padding_length + assert content_len + pad_len == cutoff_plus_one + assert sequence_boundaries[0] == 0 + assert sequence_boundaries[-1] == cutoff_plus_one + + content_ids = list(range(100, 100 + content_len)) + if vision_start_id is not None and vision_end_id is not None and image_pad_id is not None: + image_counts_by_subseq = Counter(image_subseq_ids) + for subseq_idx, image_count in sorted(image_counts_by_subseq.items()): + if subseq_idx >= len(sequence_boundaries) - 1: + continue + + subseq_start = sequence_boundaries[subseq_idx] + subseq_end = sequence_boundaries[subseq_idx + 1] + subseq_len = subseq_end - subseq_start + if subseq_len < 3: + continue + + # Build repeated image groups while preserving at least 3 tokens for each remaining image. 
+ injected_tokens: list[int] = [] + remaining = subseq_len + for image_idx in range(image_count): + remaining_images = image_count - image_idx + min_reserved_for_rest = 3 * (remaining_images - 1) + current_group_len = min(6, remaining - min_reserved_for_rest) + if current_group_len < 3: + break + + group = [vision_start_id] + [image_pad_id] * max(1, current_group_len - 2) + [vision_end_id] + injected_tokens.extend(group[:current_group_len]) + remaining -= current_group_len + + if injected_tokens: + insert_end = subseq_start + len(injected_tokens) + content_ids[subseq_start:insert_end] = injected_tokens + + input_ids = content_ids + [pad_token_id] * pad_len + attention_mask = [1] * content_len + [0] * pad_len + labels = [label_ignore_id] * cutoff_plus_one + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + "images": [fake_image] * len(image_subseq_ids), + "videos": [None] * len(video_subseq_ids), + "audios": [None] * len(audio_subseq_ids), + "packing_params": packing_params, + } + + +def _make_packed_features( + *, + packing_params: dict, + pad_token_id: int, + label_ignore_id: int, + fake_image: Image.Image, + vision_start_id: int, + vision_end_id: int, + image_pad_id: int, +) -> list[dict]: + r"""Build packed features from caller-provided packing_params.""" + return [ + _make_packed_feature( + packing_params=packing_params, + pad_token_id=pad_token_id, + label_ignore_id=label_ignore_id, + fake_image=fake_image, + vision_start_id=vision_start_id, + vision_end_id=vision_end_id, + image_pad_id=image_pad_id, + ) + ] + +def _get_expected_position_ids(packing_params, get_rope_func, input_ids, attention_mask) -> torch.Tensor: + bound_list = packing_params["sequence_boundaries"] + input_ids_slices = [input_ids[bound_list[i]:bound_list[i+1]] for i in range(len(bound_list) - 1)] + attention_mask_slices = [attention_mask[bound_list[i]:bound_list[i+1]] for i in range(len(bound_list) - 1)] + img_counts_by_subseq = 
Counter(packing_params["image_subseq_ids"]) + all_position_ids = [] + for i, input_ids_slice in enumerate(input_ids_slices): + img_cnt = img_counts_by_subseq[i] + if sum(attention_mask_slices[i]) == 0: + continue + + rope_func_kwargs = { + "input_ids": torch.tensor(input_ids_slice).unsqueeze(0), + "attention_mask": torch.tensor(attention_mask_slices[i]).unsqueeze(0), + "image_grid_thw": [torch.tensor([1, 4, 4])] * img_cnt, + } + position_ids, _ = get_rope_func(**rope_func_kwargs) + all_position_ids.append(position_ids) + + return torch.cat(all_position_ids, dim=-1) + + +@pytest.mark.runs_on(["cpu", "mps"]) +def test_multimodal_collator_with_packing(): + model_args, data_args, *_ = get_infer_args( + {"model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct", "template": "qwen2_vl"} + ) + tokenizer_module = load_tokenizer(model_args) + template = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args) + tokenizer_module["tokenizer"].padding_side = "right" + config = AutoConfig.from_pretrained(model_args.model_name_or_path) + with torch.device("meta"): + model = AutoModelForImageTextToText.from_config(config) + + data_collator = MultiModalDataCollatorForSeq2Seq( + template=template, + model=model, + pad_to_multiple_of=4, + label_pad_token_id=IGNORE_INDEX, + **tokenizer_module, + ) + + tokenizer = tokenizer_module["tokenizer"] + packing_params = { + "sequence_boundaries": [0, 2, 10, 18, 28, 32], + "image_subseq_ids": [1, 2, 3], + "video_subseq_ids": [], + "audio_subseq_ids": [], + "unpadded_length": 28, + "right_padding_length": 4, + } + fake_image = Image.new("RGB", (64, 64), (255, 255, 255)) + features = _make_packed_features( + packing_params=packing_params, + pad_token_id=tokenizer.pad_token_id, + label_ignore_id=IGNORE_INDEX, + fake_image=fake_image, + vision_start_id=tokenizer.convert_tokens_to_ids("<|vision_start|>"), + vision_end_id=tokenizer.convert_tokens_to_ids("<|vision_end|>"), + image_pad_id=tokenizer.convert_tokens_to_ids("<|image_pad|>"), + ) + 
expected_position_ids = _get_expected_position_ids( + packing_params, + data_collator.get_rope_func, + features[0]["input_ids"], + features[0]["attention_mask"], + ) + batch_input = data_collator(features) # [3, bsz, seq_len] + valid_len = expected_position_ids.shape[-1] + assert batch_input["position_ids"][1:, :, :valid_len].eq(expected_position_ids).all() + + @pytest.mark.runs_on(["cpu"]) def test_4d_attention_mask(): o = 0.0