improve aligner

Former-commit-id: cc7296b92e10c24967fc753393275b71d300683f
2024-02-10 16:39:19 +08:00
parent a41fa6e730
commit 1955a8ea5a
10 changed files with 80 additions and 64 deletions
--- a/data/README.md
+++ b/data/README.md
@@ -11,7 +11,7 @@ If you are using a custom dataset, please provide your dataset definition in the
  "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
  "ranking": "whether the dataset is a preference dataset or not. (default: false)",
  "formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
-  "columns": {
+  "columns (optional)": {
    "prompt": "the column name in the dataset containing the prompts. (default: instruction)",
    "query": "the column name in the dataset containing the queries. (default: input)",
    "response": "the column name in the dataset containing the responses. (default: output)",
@@ -20,14 +20,14 @@ If you are using a custom dataset, please provide your dataset definition in the
    "system": "the column name in the dataset containing the system prompts. (default: None)",
    "tools": "the column name in the dataset containing the tool description. (default: None)"
  },
-  "tags": {
+  "tags (optional, used for the sharegpt format)": {
    "role_tag": "the key in the message represents the identity. (default: from)",
    "content_tag": "the key in the message represents the content. (default: value)",
    "user_tag": "the value of the role_tag represents the user. (default: human)",
    "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
    "observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
    "function_tag": "the value of the role_tag represents the function call. (default: function_call)",
-    "system_tag": "the value of the role_tag represents the system prompt. (default: None) incompatible with system column"
+    "system_tag": "the value of the role_tag represents the system prompt. (default: system, can override system column)"
  }
 }
 ```
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -11,7 +11,7 @@
  "folder": "Hugging Face 仓库的文件夹名称（可选，默认：None）",
  "ranking": "是否为偏好数据集（可选，默认：False）",
  "formatting": "数据集格式（可选，默认：alpaca，可以为 alpaca 或 sharegpt）",
-  "columns": {
+  "columns（可选）": {
    "prompt": "数据集代表提示词的表头名称（默认：instruction）",
    "query": "数据集代表请求的表头名称（默认：input）",
    "response": "数据集代表回答的表头名称（默认：output）",
@@ -20,13 +20,14 @@
    "system": "数据集代表系统提示的表头名称（默认：None）",
    "tools": "数据集代表工具描述的表头名称（默认：None）"
  },
-  "tags": {
+  "tags（可选，用于 sharegpt 格式）": {
    "role_tag": "消息中代表发送者身份的键名（默认：from）",
    "content_tag": "消息中代表文本内容的键名（默认：value）",
    "user_tag": "消息中代表用户的 role_tag（默认：human）",
    "assistant_tag": "消息中代表助手的 role_tag（默认：gpt）",
    "observation_tag": "消息中代表工具返回结果的 role_tag（默认：observation）",
-    "function_tag": "消息中代表工具调用的 role_tag（默认：function_call）"
+    "function_tag": "消息中代表工具调用的 role_tag（默认：function_call）",
+    "system_tag": "消息中代表系统提示的 role_tag（默认：system，会覆盖 system 列）"
  }
 }
 ```