Former-commit-id: a4149fbcd600d4f3815f9353e5e92c569719bed6
This commit is contained in:
hiyouga
2024-01-21 00:03:09 +08:00
parent 5c9815ef6f
commit 50459a39f4
5 changed files with 60 additions and 50 deletions

View File

@@ -2,7 +2,7 @@ import json
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
-from typing import Any, Dict, List, Literal, Set, Sequence, Tuple, Union
+from typing import Any, Dict, List, Literal, Sequence, Set, Tuple, Union
SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]]

View File

@@ -144,8 +144,8 @@ class Template:
max_len=(cutoff_len - total_length),
reserved_label_len=reserved_label_len,
)
-encoded_messages[i] = encoded_messages[i][: max_source_len]
-encoded_messages[i + 1] = encoded_messages[i + 1][: max_target_len]
+encoded_messages[i] = encoded_messages[i][:max_source_len]
+encoded_messages[i + 1] = encoded_messages[i + 1][:max_target_len]
total_length += len(encoded_messages[i]) + len(encoded_messages[i + 1])
encoded_pairs.append((encoded_messages[i], encoded_messages[i + 1]))
@@ -416,7 +416,7 @@ register_template(
"by the user such as English and 中文."
),
stop_words=["<|im_end|>"],
-efficient_eos=True,
+efficient_eos=True, # internlm2 tokenizer cannot set eos_token_id
)
@@ -455,9 +455,7 @@ register_template(
register_template(
name="openchat",
-format_user=StringFormatter(
-slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]
-),
+format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
format_assistant=StringFormatter(slots=["{{content}}"]),
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
force_system=True,