improve fix tokenizer

Former-commit-id: 57b138abad6397596bc47be94e092e8fabedc06f
This commit is contained in:
hiyouga
2024-02-09 14:53:14 +08:00
parent 2c3ef480a6
commit b98a64448a
6 changed files with 105 additions and 71 deletions

View File

@@ -142,7 +142,7 @@ def get_dataset(
stage: Literal["pt", "sft", "rm", "ppo"],
# split: Optional[str] = "train", # TODO: add split
) -> Union["Dataset", "IterableDataset"]:
- template = get_template_and_fix_tokenizer(data_args.template, tokenizer)
+ template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
if data_args.train_on_prompt and template.efficient_eos:
raise ValueError("Current template does not support `train_on_prompt`.")