Former-commit-id: 4e1af5a5d39d9e2f374c1372e2d67120c63fea09
This commit is contained in:
hiyouga
2023-12-09 20:53:18 +08:00
parent 2e6ed731cf
commit f3ffa8310f
4 changed files with 18 additions and 13 deletions

View File

@@ -4,9 +4,10 @@ If you are using a custom dataset, please provide your dataset definition in the
"dataset_name": {
"hf_hub_url": "the name of the dataset repository on the Hugging Face hub. (if specified, ignore below 3 arguments)",
"script_url": "the name of the directory containing a dataset loading script. (if specified, ignore below 2 arguments)",
"file_name": "the name of the dataset file in the this directory. (required if above are not specified)",
"file_name": "the name of the dataset file in this directory. (required if above are not specified)",
"file_sha1": "the SHA-1 hash value of the dataset file. (optional, does not affect training)",
"subset": "the name of the subset. (optional, default: None)",
"folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
"ranking": "whether the dataset is a preference dataset or not. (default: false)",
"formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
"columns": {

View File

@@ -2,11 +2,12 @@
```json
"数据集名称": {
"hf_hub_url": "Hugging Face 上的项目地址(若指定,则忽略下列三个参数)",
"hf_hub_url": "Hugging Face 的仓库地址(若指定,则忽略下列三个参数)",
"script_url": "包含数据加载脚本的本地文件夹名称(若指定,则忽略下列两个参数)",
"file_name": "该目录下数据集文件的名称(若上述参数未指定,则此项必需)",
"file_sha1": "数据集文件的SHA-1哈希值可选留空不影响训练",
"file_sha1": "数据集文件的 SHA-1 哈希值(可选,留空不影响训练)",
"subset": "数据集子集的名称可选默认None",
"folder": "Hugging Face 仓库的文件夹名称可选默认None",
"ranking": "是否为偏好数据集可选默认False",
"formatting": "数据集格式可选默认alpaca可以为 alpaca 或 sharegpt",
"columns": {