|
|
|
|
@@ -198,7 +198,7 @@ class Llama2Template(Template):
|
|
|
|
|
templates: Dict[str, Template] = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def register_template(
|
|
|
|
|
def _register_template(
|
|
|
|
|
name: str,
|
|
|
|
|
format_user: Optional["Formatter"] = None,
|
|
|
|
|
format_assistant: Optional["Formatter"] = None,
|
|
|
|
|
@@ -236,29 +236,45 @@ def register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer") -> Template:
|
|
|
|
|
if tokenizer.eos_token_id is None:
|
|
|
|
|
tokenizer.eos_token = "<|endoftext|>"
|
|
|
|
|
def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) -> None:
|
|
|
|
|
is_added = tokenizer.eos_token_id is None
|
|
|
|
|
is_oov = eos_token not in tokenizer.get_vocab()
|
|
|
|
|
tokenizer.add_special_tokens({"eos_token": eos_token})
|
|
|
|
|
|
|
|
|
|
if is_added:
|
|
|
|
|
logger.info("Add eos token: {}".format(tokenizer.eos_token))
|
|
|
|
|
else:
|
|
|
|
|
logger.info("Replace eos token: {}".format(tokenizer.eos_token))
|
|
|
|
|
|
|
|
|
|
if tokenizer.pad_token_id is None:
|
|
|
|
|
tokenizer.pad_token = tokenizer.eos_token
|
|
|
|
|
logger.info("Add pad token: {}".format(tokenizer.pad_token))
|
|
|
|
|
if is_oov:
|
|
|
|
|
logger.warning("New token is added, you must enable `resize_vocab` to activate it.")
|
|
|
|
|
|
|
|
|
|
if name is None: # for pre-training
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
template = templates.get(name, None)
|
|
|
|
|
assert template is not None, "Template {} does not exist.".format(name)
|
|
|
|
|
def get_template_and_fix_tokenizer(
|
|
|
|
|
tokenizer: "PreTrainedTokenizer",
|
|
|
|
|
name: Optional[str] = None,
|
|
|
|
|
) -> Template:
|
|
|
|
|
if name is None:
|
|
|
|
|
template = templates["vanilla"] # placeholder
|
|
|
|
|
else:
|
|
|
|
|
template = templates.get(name, None)
|
|
|
|
|
if templates is None:
|
|
|
|
|
raise ValueError("Template {} does not exist.".format(name))
|
|
|
|
|
|
|
|
|
|
stop_words = template.stop_words
|
|
|
|
|
if template.replace_eos:
|
|
|
|
|
if not stop_words:
|
|
|
|
|
raise ValueError("Stop words are required to replace the EOS token.")
|
|
|
|
|
|
|
|
|
|
tokenizer.eos_token = stop_words[0]
|
|
|
|
|
_add_or_replace_eos_token(tokenizer, eos_token=stop_words[0])
|
|
|
|
|
stop_words = stop_words[1:]
|
|
|
|
|
logger.info("Replace eos token: {}".format(tokenizer.eos_token))
|
|
|
|
|
|
|
|
|
|
if tokenizer.eos_token_id is None:
|
|
|
|
|
_add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")
|
|
|
|
|
|
|
|
|
|
if tokenizer.pad_token_id is None:
|
|
|
|
|
tokenizer.pad_token = tokenizer.eos_token
|
|
|
|
|
logger.info("Add pad token: {}".format(tokenizer.pad_token))
|
|
|
|
|
|
|
|
|
|
if stop_words:
|
|
|
|
|
tokenizer.add_special_tokens(
|
|
|
|
|
@@ -269,7 +285,7 @@ def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer")
|
|
|
|
|
return template
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="alpaca",
|
|
|
|
|
format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=["\n\n"]),
|
|
|
|
|
@@ -279,7 +295,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="aquila",
|
|
|
|
|
format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=["###"]),
|
|
|
|
|
@@ -292,21 +308,21 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="baichuan",
|
|
|
|
|
format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
|
|
|
|
|
efficient_eos=True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="baichuan2",
|
|
|
|
|
format_user=StringFormatter(slots=[{"token": "<reserved_106>"}, "{{content}}", {"token": "<reserved_107>"}]),
|
|
|
|
|
efficient_eos=True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="belle",
|
|
|
|
|
format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),
|
|
|
|
|
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
|
|
|
|
|
@@ -315,13 +331,13 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="bluelm",
|
|
|
|
|
format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="chatglm2",
|
|
|
|
|
format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]),
|
|
|
|
|
format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
|
|
|
|
|
@@ -331,7 +347,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="chatglm3",
|
|
|
|
|
format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
|
|
|
|
|
format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
|
|
|
|
|
@@ -351,14 +367,25 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="chatml_de",
|
|
|
|
|
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
|
|
|
|
|
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=["\n"]),
|
|
|
|
|
default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
|
|
|
|
|
stop_words=["<|im_end|>"],
|
|
|
|
|
replace_eos=True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_register_template(
|
|
|
|
|
name="codegeex2",
|
|
|
|
|
format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
|
|
|
|
|
force_system=True,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="cpm",
|
|
|
|
|
format_user=StringFormatter(slots=["<用户>{{content}}<AI>"]),
|
|
|
|
|
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
|
|
|
|
|
@@ -366,7 +393,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="deepseek",
|
|
|
|
|
format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
|
|
|
|
|
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
|
|
|
|
|
@@ -374,7 +401,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="deepseekcoder",
|
|
|
|
|
format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
|
|
|
|
|
format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
|
|
|
|
|
@@ -390,14 +417,14 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="default",
|
|
|
|
|
format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=["\n"]),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="falcon",
|
|
|
|
|
format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=["\n"]),
|
|
|
|
|
@@ -405,7 +432,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="intern",
|
|
|
|
|
format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": "<eoh>"}, "\n<|Bot|>:"]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=[{"token": "<eoa>"}, "\n"]),
|
|
|
|
|
@@ -414,7 +441,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="intern2",
|
|
|
|
|
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
|
|
|
|
|
format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]),
|
|
|
|
|
@@ -431,7 +458,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="llama2",
|
|
|
|
|
format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
|
|
|
|
|
format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
|
|
|
|
|
@@ -448,7 +475,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="llama2_zh",
|
|
|
|
|
format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
|
|
|
|
|
format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
|
|
|
|
|
@@ -456,7 +483,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="mistral",
|
|
|
|
|
format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]),
|
|
|
|
|
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
|
|
|
|
|
@@ -464,7 +491,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="openchat",
|
|
|
|
|
format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
|
|
|
|
|
format_assistant=StringFormatter(slots=["{{content}}"]),
|
|
|
|
|
@@ -473,7 +500,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="orion",
|
|
|
|
|
format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
|
|
|
|
|
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
|
|
|
|
|
@@ -481,7 +508,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="qwen",
|
|
|
|
|
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
|
|
|
|
|
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
|
|
|
|
|
@@ -492,7 +519,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="solar",
|
|
|
|
|
format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
|
|
|
|
|
format_system=StringFormatter(slots=["### System:\n{{content}}\n\n"]),
|
|
|
|
|
@@ -500,7 +527,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="starchat",
|
|
|
|
|
format_user=StringFormatter(
|
|
|
|
|
slots=[{"token": "<|user|>"}, "\n{{content}}", {"token": "<|end|>"}, "\n", {"token": "<|assistant|>"}]
|
|
|
|
|
@@ -513,20 +540,12 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(name="vanilla")
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
name="chatml_de",
|
|
|
|
|
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
|
|
|
|
|
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=["\n"]),
|
|
|
|
|
default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
|
|
|
|
|
stop_words=["<|im_end|>"],
|
|
|
|
|
replace_eos=True,
|
|
|
|
|
_register_template(
|
|
|
|
|
name="vanilla",
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="vicuna",
|
|
|
|
|
format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
|
|
|
|
|
default_system=(
|
|
|
|
|
@@ -536,7 +555,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="xuanyuan",
|
|
|
|
|
format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
|
|
|
|
|
default_system=(
|
|
|
|
|
@@ -547,10 +566,13 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(name="xverse", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]))
|
|
|
|
|
_register_template(
|
|
|
|
|
name="xverse",
|
|
|
|
|
format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]),
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="yayi",
|
|
|
|
|
format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
|
|
|
|
|
format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]),
|
|
|
|
|
@@ -570,7 +592,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="yi",
|
|
|
|
|
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=["\n"]),
|
|
|
|
|
@@ -579,7 +601,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="yuan",
|
|
|
|
|
format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=["\n"]),
|
|
|
|
|
@@ -588,7 +610,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="zephyr",
|
|
|
|
|
format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]),
|
|
|
|
|
format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
|
|
|
|
|
@@ -596,7 +618,7 @@ register_template(
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
register_template(
|
|
|
|
|
_register_template(
|
|
|
|
|
name="ziya",
|
|
|
|
|
format_user=StringFormatter(slots=[{"token": "<human>"}, ":{{content}}\n", {"token": "<bot>"}, ":"]),
|
|
|
|
|
format_separator=EmptyFormatter(slots=["\n"]),
|
|
|
|
|
|