From 6ef9854713bf21f1fc32ff12bcf4cbd418a59757 Mon Sep 17 00:00:00 2001 From: Yaowei Zheng Date: Mon, 22 Dec 2025 00:20:55 +0800 Subject: [PATCH] [misc] fix cache & pin transformers to 4.57.1 (#9638) --- .github/workflows/docker.yml | 7 ++++--- .github/workflows/tests.yml | 1 + README.md | 27 +++++++-------------------- README_zh.md | 27 +++++++-------------------- requirements.txt | 2 +- src/llamafactory/data/template.py | 2 +- src/llamafactory/extras/constants.py | 14 +++++++++++++- src/llamafactory/extras/misc.py | 2 +- 8 files changed, 35 insertions(+), 47 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 06a5c6c08..3347ddc44 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -99,7 +99,7 @@ jobs: tags: | docker.io/hiyouga/llamafactory:${{ steps.version.outputs.tag }} cache-from: type=gha - cache-to: type=gha,mode=max + cache-to: type=gha,mode=min - name: Build and push Docker image (NPU-A2) if: ${{ matrix.device == 'npu' && matrix.npu_type == 'a2' }} @@ -113,7 +113,7 @@ jobs: docker.io/hiyouga/llamafactory:${{ steps.version.outputs.tag }}-npu-a2 quay.io/ascend/llamafactory:${{ steps.version.outputs.tag }}-npu-a2 cache-from: type=gha - cache-to: type=gha,mode=max + cache-to: type=gha,mode=min - name: Build and push Docker image (NPU-A3) if: ${{ matrix.device == 'npu' && matrix.npu_type == 'a3' }} @@ -129,4 +129,5 @@ jobs: docker.io/hiyouga/llamafactory:${{ steps.version.outputs.tag }}-npu-a3 quay.io/ascend/llamafactory:${{ steps.version.outputs.tag }}-npu-a3 cache-from: type=gha - cache-to: type=gha,mode=max + cache-to: type=gha,mode=min + # https://docs.docker.com/build/cache/backends/#cache-mode diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d1f629317..f53fd9114 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -73,6 +73,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + python -m pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu python -m pip install ".[torch,dev]" - name: Install transformers diff --git a/README.md b/README.md index 04c372394..b6fdfefef 100644 --- a/README.md +++ b/README.md @@ -278,27 +278,21 @@ Read technical notes: | Model | Model size | Template | | ----------------------------------------------------------------- | -------------------------------- | -------------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | | [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | +| [DeepSeek (LLM/Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [DeepSeek 3-3.2](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 | | [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 | +| [Falcon/Falcon H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/11B/34B/40B/180B | falcon/falcon_h1 | | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 | | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n | | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 | -| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v | | [GLM-4.5/GLM-4.5(6)V](https://huggingface.co/zai-org) | 9B/106B/355B | glm4_moe/glm4_5v | | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - | -| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt | -| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 | -| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 | +| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt_oss | +| [Granite 3-4](https://huggingface.co/ibm-granite) | 1B/2B/3B/7B/8B | granite3/granite4 | | [Hunyuan (MT)](https://huggingface.co/tencent/) | 7B | hunyuan | -| [Index](https://huggingface.co/IndexTeam) | 1.9B | index | | [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 | | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl | | [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 | @@ -312,16 +306,13 @@ Read technical notes: | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | | [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | | [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo | -| [MiMo-v2](https://huggingface.co/XiaomiMiMo) | 309B | mimo_v2 | +| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B/309B | mimo/mimo_v2 | | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 | | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v | -| [Ministral(3)/Mistral-Nemo](https://huggingface.co/mistralai) | 3B/8B/12B/14B | ministral/ministral3 | +| [Ministral 3](https://huggingface.co/mistralai) | 3B/8B/14B | ministral3 | | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small | | [OLMo](https://huggingface.co/allenai) | 1B/7B | - | | [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi | | [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small | | [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 | @@ -334,13 +325,9 @@ Read technical notes: | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl | | [Qwen3-VL](https://huggingface.co/Qwen) | 2B/4B/8B/30B/32B/235B | qwen3_vl | | [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder | -| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 | | [VibeThinker-1.5B](https://huggingface.co/WeiboAI) | 1.5B | qwen3 | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | | [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | | [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] diff --git a/README_zh.md b/README_zh.md index fa3ebaab4..7995c88ce 100644 --- a/README_zh.md +++ b/README_zh.md @@ -280,27 +280,21 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc | 模型名 | 参数量 | Template | | ----------------------------------------------------------------- | -------------------------------- | -------------------- | -| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | -| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | | [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | -| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | -| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | +| [DeepSeek (LLM/Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [DeepSeek 3-3.2](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 | | [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink | -| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | -| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 | +| [Falcon/Falcon H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/11B/34B/40B/180B | falcon/falcon_h1 | | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 | | [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n | | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 | -| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v | | [GLM-4.5/GLM-4.5(6)V](https://huggingface.co/zai-org) | 9B/106B/355B | glm4_moe/glm4_5v | | [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - | -| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt | -| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 | -| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 | +| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt_oss | +| [Granite 3-4](https://huggingface.co/ibm-granite) | 1B/2B/3B/7B/8B | granite3/granite4 | | [Hunyuan (MT)](https://huggingface.co/tencent/) | 7B | hunyuan | -| [Index](https://huggingface.co/IndexTeam) | 1.9B | index | | [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 | | [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl | | [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 | @@ -314,16 +308,13 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | | [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | | [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | -| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo | -| [MiMo-v2](https://huggingface.co/XiaomiMiMo) | 309B | mimo_v2 | +| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B/309B | mimo/mimo_v2 | | [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 | | [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v | -| [Ministral(3)/Mistral-Nemo](https://huggingface.co/mistralai) | 3B/8B/12B/14B | ministral/ministral3 | +| [Ministral 3](https://huggingface.co/mistralai) | 3B/8B/14B | ministral3 | | [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | -| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small | | [OLMo](https://huggingface.co/allenai) | 1B/7B | - | | [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma | -| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi | | [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small | | [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 | @@ -336,13 +327,9 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc | [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl | | [Qwen3-VL](https://huggingface.co/Qwen) | 2B/4B/8B/30B/32B/235B | qwen3_vl | | [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder | -| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | -| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 | | [VibeThinker-1.5B](https://huggingface.co/WeiboAI) | 1.5B | qwen3 | -| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | | [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | -| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | | [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | > [!NOTE] diff --git a/requirements.txt b/requirements.txt index 59105a9dd..0a273440f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # core deps transformers>=4.49.0,<=4.56.2,!=4.52.0; python_version < '3.10' -transformers>=4.49.0,<=4.57.3,!=4.52.0,!=4.57.0; python_version >= '3.10' +transformers>=4.49.0,<=4.57.1,!=4.52.0,!=4.57.0; python_version >= '3.10' datasets>=2.16.0,<=4.0.0 accelerate>=1.3.0,<=1.11.0 peft>=0.14.0,<=0.17.1 diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 19d354f31..6a8a38b7f 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -1166,7 +1166,7 @@ register_template( register_template( - name="gpt", + name="gpt_oss", format_user=StringFormatter(slots=["<|start|>user<|message|>{{content}}<|end|><|start|>assistant"]), format_assistant=StringFormatter(slots=["{{content}}<|end|>"]), format_system=StringFormatter(slots=["<|start|>system<|message|>{{content}}<|end|>"]), diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 134cb5fd8..dcf989632 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1067,7 +1067,7 @@ register_model_group( DownloadSource.MODELSCOPE: "openai/gpt-oss-120b", }, }, - template="gpt", + template="gpt_oss", ) @@ -1995,6 +1995,18 @@ register_model_group( register_model_group( models={ + "Ministral-3-3B-Base-2512": { + DownloadSource.DEFAULT: "mistralai/Ministral-3-3B-Base-2512", + DownloadSource.MODELSCOPE: "mistralai/Ministral-3-3B-Base-2512", + }, + "Ministral-3-8B-Base-2512": { + DownloadSource.DEFAULT: "mistralai/Ministral-3-8B-Base-2512", + DownloadSource.MODELSCOPE: "mistralai/Ministral-3-8B-Base-2512", + }, + "Ministral-3-14B-Base-2512": { + DownloadSource.DEFAULT: "mistralai/Ministral-3-14B-Base-2512", + DownloadSource.MODELSCOPE: "mistralai/Ministral-3-14B-Base-2512", + }, "Ministral-3-3B-Instruct-2512": { DownloadSource.DEFAULT: "mistralai/Ministral-3-3B-Instruct-2512", DownloadSource.MODELSCOPE: "mistralai/Ministral-3-3B-Instruct-2512", diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index dc934bd26..5c4c24787 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -94,7 +94,7 @@ def check_version(requirement: str, mandatory: bool = False) -> None: def check_dependencies() -> None: r"""Check the version of the required packages.""" - check_version("transformers>=4.49.0,<=4.57.3") + check_version("transformers>=4.49.0,<=4.57.1") check_version("datasets>=2.16.0,<=4.0.0") check_version("accelerate>=1.3.0,<=1.11.0") check_version("peft>=0.14.0,<=0.17.1")