[breaking] bump transformers to 4.45.0 & improve CI (#7746)

* update CI

* fix

* fix

* fix

* fix

* fix
This commit is contained in:
hoshi-hiyouga
2025-04-17 02:36:48 +08:00
committed by GitHub
parent d222f63cb7
commit 86ebb219d6
23 changed files with 211 additions and 140 deletions

View File

@@ -18,7 +18,6 @@ from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_availabl
from ...extras import logging
from ...extras.constants import AttentionFunction
from ...extras.misc import check_version
if TYPE_CHECKING:
@@ -36,8 +35,6 @@ def configure_attn_implementation(
if getattr(config, "model_type", None) == "gemma2" and is_trainable:
if model_args.flash_attn == AttentionFunction.AUTO or model_args.flash_attn == AttentionFunction.FA2:
if is_flash_attn_2_available():
check_version("transformers>=4.42.4")
check_version("flash_attn>=2.6.3")
if model_args.flash_attn != AttentionFunction.FA2:
logger.warning_rank0("Gemma 2 should use flash attention 2, change `flash_attn` to fa2.")
model_args.flash_attn = AttentionFunction.FA2

View File

@@ -350,7 +350,7 @@ def llama_sdpa_attention_forward(
def _apply_llama_patch() -> None:
check_version("transformers>=4.41.2,<4.48.0")
check_version("transformers>=4.43.0,<4.48.0", mandatory=True)
LlamaAttention.forward = llama_attention_forward
LlamaFlashAttention2.forward = llama_flash_attention_2_forward
LlamaSdpaAttention.forward = llama_sdpa_attention_forward

View File

@@ -43,7 +43,6 @@ import torch
import torch.nn.functional as F
from ...extras import logging
from ...extras.misc import check_version
from ...extras.packages import is_transformers_version_greater_than
@@ -117,6 +116,5 @@ def configure_packing(model_args: "ModelArguments", is_trainable: bool) -> None:
if not is_trainable or not model_args.block_diag_attn:
return
check_version("transformers>=4.43.0")
transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data
logger.info_rank0("Using block diagonal attention for sequence packing without cross-attention.")