[breaking change] refactor data pipeline (#6901)

* refactor data

* rename file

Former-commit-id: 7a1a4ce6451cb782573d0bd9dd27a5e443e3a18b
This commit is contained in:
hoshi-hiyouga
2025-02-13 00:39:20 +08:00
committed by GitHub
parent 80b89978d9
commit 46203856fc
27 changed files with 1145 additions and 1132 deletions

View File

@@ -405,7 +405,7 @@ class Llama2Template(Template):
TEMPLATES: Dict[str, "Template"] = {}
def _register_template(
def register_template(
name: str,
format_user: Optional["Formatter"] = None,
format_assistant: Optional["Formatter"] = None,
@@ -421,7 +421,7 @@ def _register_template(
replace_eos: bool = False,
replace_jinja_template: bool = False,
mm_plugin: "BasePlugin" = get_mm_plugin(name="base"),
template_class: Type[Template] = Template,
template_class: Type["Template"] = Template,
) -> None:
r"""
Registers a chat template.
@@ -436,7 +436,7 @@ def _register_template(
The corresponding code should be:
```
_register_template(
register_template(
name="custom",
format_user=StringFormatter(slots=["<user>{{content}}\n<model>"]),
format_assistant=StringFormatter(slots=["{{content}}</s>\n"]),
@@ -444,6 +444,9 @@ def _register_template(
)
```
"""
if name in TEMPLATES:
raise ValueError(f"Template {name} already exists.")
default_slots = ["{{content}}"] if efficient_eos else ["{{content}}", {"eos_token"}]
default_user_formatter = StringFormatter(slots=["{{content}}"])
default_assistant_formatter = StringFormatter(slots=default_slots)
@@ -562,7 +565,7 @@ def get_template_and_fix_tokenizer(tokenizer: "PreTrainedTokenizer", data_args:
return template
_register_template(
register_template(
name="alpaca",
format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n\n"]),
@@ -573,7 +576,7 @@ _register_template(
)
_register_template(
register_template(
name="aquila",
format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),
format_assistant=StringFormatter(slots=["{{content}}###"]),
@@ -586,7 +589,7 @@ _register_template(
)
_register_template(
register_template(
name="atom",
format_user=StringFormatter(
slots=[{"bos_token"}, "Human: {{content}}\n", {"eos_token"}, {"bos_token"}, "Assistant:"]
@@ -595,21 +598,21 @@ _register_template(
)
_register_template(
register_template(
name="baichuan",
format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
efficient_eos=True,
)
_register_template(
register_template(
name="baichuan2",
format_user=StringFormatter(slots=["<reserved_106>{{content}}<reserved_107>"]),
efficient_eos=True,
)
_register_template(
register_template(
name="belle",
format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),
format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n\n"]),
@@ -617,13 +620,13 @@ _register_template(
)
_register_template(
register_template(
name="bluelm",
format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
)
_register_template(
register_template(
name="breeze",
format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
@@ -631,7 +634,7 @@ _register_template(
)
_register_template(
register_template(
name="chatglm2",
format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]),
format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
@@ -639,7 +642,7 @@ _register_template(
)
_register_template(
register_template(
name="chatglm3",
format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
@@ -655,7 +658,7 @@ _register_template(
)
_register_template(
register_template(
name="chatml",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -668,7 +671,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="chatml_de",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -681,13 +684,13 @@ _register_template(
)
_register_template(
register_template(
name="codegeex2",
format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
)
_register_template(
register_template(
name="codegeex4",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]),
format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
@@ -704,7 +707,7 @@ _register_template(
)
_register_template(
register_template(
name="cohere",
format_user=StringFormatter(
slots=[
@@ -719,7 +722,7 @@ _register_template(
)
_register_template(
register_template(
name="cpm",
format_user=StringFormatter(slots=["<用户>{{content}}<AI>"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
@@ -727,7 +730,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="cpm3",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -738,7 +741,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="dbrx",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -763,7 +766,7 @@ _register_template(
)
_register_template(
register_template(
name="deepseek",
format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
format_system=StringFormatter(slots=["{{content}}\n\n"]),
@@ -771,14 +774,14 @@ _register_template(
)
_register_template(
register_template(
name="deepseek3",
format_user=StringFormatter(slots=["<User>{{content}}<Assistant>"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
_register_template(
register_template(
name="deepseekcoder",
format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
format_assistant=StringFormatter(slots=["\n{{content}}\n<|EOT|>\n"]),
@@ -792,7 +795,7 @@ _register_template(
)
_register_template(
register_template(
name="default",
format_user=StringFormatter(slots=["Human: {{content}}\nAssistant:"]),
format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n"]),
@@ -800,13 +803,13 @@ _register_template(
)
_register_template(
register_template(
name="empty",
format_assistant=StringFormatter(slots=["{{content}}"]),
)
_register_template(
register_template(
name="exaone",
format_user=StringFormatter(slots=["[|user|]{{content}}\n[|assistant|]"]),
format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n"]),
@@ -814,7 +817,7 @@ _register_template(
)
_register_template(
register_template(
name="falcon",
format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
format_assistant=StringFormatter(slots=["{{content}}\n"]),
@@ -822,14 +825,14 @@ _register_template(
)
_register_template(
register_template(
name="fewshot",
format_assistant=StringFormatter(slots=["{{content}}\n\n"]),
efficient_eos=True,
)
_register_template(
register_template(
name="gemma",
format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
@@ -840,7 +843,7 @@ _register_template(
)
_register_template(
register_template(
name="glm4",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
format_assistant=StringFormatter(slots=["\n{{content}}"]),
@@ -854,7 +857,7 @@ _register_template(
)
_register_template(
register_template(
name="granite3",
format_user=StringFormatter(
slots=[
@@ -866,7 +869,7 @@ _register_template(
)
_register_template(
register_template(
name="index",
format_user=StringFormatter(slots=["reserved_0{{content}}reserved_1"]),
format_system=StringFormatter(slots=["<unk>{{content}}"]),
@@ -874,7 +877,7 @@ _register_template(
)
_register_template(
register_template(
name="intern",
format_user=StringFormatter(slots=["<|User|>:{{content}}\n<|Bot|>:"]),
format_assistant=StringFormatter(slots=["{{content}}<eoa>\n"]),
@@ -891,7 +894,7 @@ _register_template(
)
_register_template(
register_template(
name="intern2",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -908,7 +911,7 @@ _register_template(
)
_register_template(
register_template(
name="llama2",
format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
@@ -917,7 +920,7 @@ _register_template(
# copied from llama2 template
_register_template(
register_template(
name="llama2_zh",
format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
@@ -926,7 +929,7 @@ _register_template(
)
_register_template(
register_template(
name="llama3",
format_user=StringFormatter(
slots=[
@@ -954,7 +957,7 @@ _register_template(
# copied from llama3 template
_register_template(
register_template(
name="mllama",
format_user=StringFormatter(
slots=[
@@ -983,7 +986,7 @@ _register_template(
# copied from vicuna template
_register_template(
register_template(
name="llava",
format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
default_system=(
@@ -995,7 +998,7 @@ _register_template(
# copied from vicuna template
_register_template(
register_template(
name="llava_next",
format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
default_system=(
@@ -1007,7 +1010,7 @@ _register_template(
# copied from llama3 template
_register_template(
register_template(
name="llava_next_llama3",
format_user=StringFormatter(
slots=[
@@ -1036,7 +1039,7 @@ _register_template(
# copied from mistral template
_register_template(
register_template(
name="llava_next_mistral",
format_user=StringFormatter(slots=["[INST] {{content}}[/INST]"]),
format_assistant=StringFormatter(slots=[" {{content}}", {"eos_token"}]),
@@ -1051,7 +1054,7 @@ _register_template(
# copied from qwen template
_register_template(
register_template(
name="llava_next_qwen",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1068,7 +1071,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="llava_next_yi",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1079,7 +1082,7 @@ _register_template(
# copied from vicuna template
_register_template(
register_template(
name="llava_next_video",
format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
default_system=(
@@ -1091,7 +1094,7 @@ _register_template(
# copied from mistral template
_register_template(
register_template(
name="llava_next_video_mistral",
format_user=StringFormatter(slots=["[INST] {{content}}[/INST]"]),
format_assistant=StringFormatter(slots=[" {{content}}", {"eos_token"}]),
@@ -1106,7 +1109,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="llava_next_video_yi",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1117,7 +1120,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="marco",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1133,7 +1136,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="minicpm_v",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1144,7 +1147,7 @@ _register_template(
# copied from minicpm_v template
_register_template(
register_template(
name="minicpm_o",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1155,7 +1158,7 @@ _register_template(
# mistral tokenizer v3 tekken
_register_template(
register_template(
name="ministral",
format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
format_system=StringFormatter(slots=["{{content}}\n\n"]),
@@ -1168,7 +1171,7 @@ _register_template(
# mistral tokenizer v3
_register_template(
register_template(
name="mistral",
format_user=StringFormatter(slots=["[INST] {{content}}[/INST]"]),
format_assistant=StringFormatter(slots=[" {{content}}", {"eos_token"}]),
@@ -1182,7 +1185,7 @@ _register_template(
# mistral tokenizer v7 tekken (copied from ministral)
_register_template(
register_template(
name="mistral_small",
format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
format_system=StringFormatter(slots=["[SYSTEM_PROMPT]{{content}}[/SYSTEM_PROMPT]"]),
@@ -1193,21 +1196,21 @@ _register_template(
)
_register_template(
register_template(
name="olmo",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]),
format_prefix=EmptyFormatter(slots=[{"eos_token"}]),
)
_register_template(
register_template(
name="openchat",
format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
_register_template(
register_template(
name="openchat-3.6",
format_user=StringFormatter(
slots=[
@@ -1223,7 +1226,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="opencoder",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1234,7 +1237,7 @@ _register_template(
)
_register_template(
register_template(
name="orion",
format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
@@ -1242,7 +1245,7 @@ _register_template(
# copied from gemma template
_register_template(
register_template(
name="paligemma",
format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
@@ -1254,7 +1257,7 @@ _register_template(
)
_register_template(
register_template(
name="phi",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|end|>\n"]),
@@ -1263,7 +1266,7 @@ _register_template(
)
_register_template(
register_template(
name="phi_small",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|end|>\n"]),
@@ -1273,7 +1276,7 @@ _register_template(
)
_register_template(
register_template(
name="phi4",
format_user=StringFormatter(
slots=["<|im_start|>user<|im_sep|>{{content}}<|im_end|><|im_start|>assistant<|im_sep|>"]
@@ -1285,7 +1288,7 @@ _register_template(
# copied from ministral template
_register_template(
register_template(
name="pixtral",
format_user=StringFormatter(slots=["[INST]{{content}}[/INST]"]),
format_system=StringFormatter(slots=["{{content}}\n\n"]),
@@ -1299,7 +1302,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="qwen",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1315,7 +1318,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="qwen2_audio",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1327,7 +1330,7 @@ _register_template(
# copied from qwen template
_register_template(
register_template(
name="qwen2_vl",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1343,7 +1346,7 @@ _register_template(
)
_register_template(
register_template(
name="sailor",
format_user=StringFormatter(slots=["<|im_start|>question\n{{content}}<|im_end|>\n<|im_start|>answer\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1357,7 +1360,7 @@ _register_template(
# copied from llama3 template
_register_template(
register_template(
name="skywork_o1",
format_user=StringFormatter(
slots=[
@@ -1391,7 +1394,7 @@ _register_template(
)
_register_template(
register_template(
name="solar",
format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
format_system=StringFormatter(slots=["### System:\n{{content}}\n\n"]),
@@ -1399,7 +1402,7 @@ _register_template(
)
_register_template(
register_template(
name="starchat",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>"]),
format_assistant=StringFormatter(slots=["{{content}}<|end|>\n"]),
@@ -1408,14 +1411,14 @@ _register_template(
)
_register_template(
register_template(
name="telechat",
format_user=StringFormatter(slots=["<_user>{{content}}<_bot>"]),
format_system=StringFormatter(slots=["<_system>{{content}}<_end>"]),
)
_register_template(
register_template(
name="telechat2",
format_user=StringFormatter(slots=["<_user>{{content}}<_bot>"]),
format_system=StringFormatter(slots=["<_system>{{content}}"]),
@@ -1425,7 +1428,7 @@ _register_template(
)
_register_template(
register_template(
name="vicuna",
format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
default_system=(
@@ -1436,7 +1439,7 @@ _register_template(
)
_register_template(
register_template(
name="video_llava",
format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
default_system=(
@@ -1447,7 +1450,7 @@ _register_template(
)
_register_template(
register_template(
name="xuanyuan",
format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
default_system=(
@@ -1458,13 +1461,13 @@ _register_template(
)
_register_template(
register_template(
name="xverse",
format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]),
)
_register_template(
register_template(
name="yayi",
format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
format_assistant=StringFormatter(slots=["{{content}}\n\n"]),
@@ -1485,7 +1488,7 @@ _register_template(
# copied from chatml template
_register_template(
register_template(
name="yi",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
@@ -1494,7 +1497,7 @@ _register_template(
)
_register_template(
register_template(
name="yi_vl",
format_user=StringFormatter(slots=["### Human: {{content}}\n### Assistant:"]),
format_assistant=StringFormatter(slots=["{{content}}\n"]),
@@ -1511,7 +1514,7 @@ _register_template(
)
_register_template(
register_template(
name="yuan",
format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
format_assistant=StringFormatter(slots=["{{content}}<eod>\n"]),
@@ -1519,7 +1522,7 @@ _register_template(
)
_register_template(
register_template(
name="zephyr",
format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>\n"]),
format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
@@ -1527,7 +1530,7 @@ _register_template(
)
_register_template(
register_template(
name="ziya",
format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
format_assistant=StringFormatter(slots=["{{content}}\n"]),