Initial commit

Former-commit-id: 5ca8e1d63727e7bcb8cab16542c763c47e48184a
2023-05-28 18:09:04 +08:00
commit 17024ebc1a
29 changed files with 2399 additions and 0 deletions
--- a/data/README.md
+++ b/data/README.md
@@ -0,0 +1,53 @@
+Data format in `dataset_info.json`:
+```json
+"dataset_name": {
+    "hf_hub_url": "the name of the dataset repository on the HuggingFace hub. (if specified, ignore the 3 arguments below)",
+    "script_url": "the name of the directory containing a dataset loading script. (if specified, ignore the 2 arguments below)",
+    "file_name": "the name of the dataset file in this directory. (required if the above are not specified)",
+    "file_sha1": "the SHA-1 hash value of the dataset file. (optional)",
+    "columns": {
+        "prompt": "the name of the column in the datasets containing the prompts. (default: instruction)",
+        "query": "the name of the column in the datasets containing the queries. (default: input)",
+        "response": "the name of the column in the datasets containing the responses. (default: output)",
+        "history": "the name of the column in the datasets containing the history of chat. (default: None)"
+    }
+}
+```
+
+`dataset_info.json` 中的数据集定义格式：
+```json
+"数据集名称": {
+    "hf_hub_url": "HuggingFace上的项目地址（若指定，则忽略下列三个参数）",
+    "script_url": "包含数据加载脚本的本地文件夹名称（若指定，则忽略下列两个参数）",
+    "file_name": "该目录下数据集文件的名称（若上述参数未指定，则此项必需）",
+    "file_sha1": "数据集文件的SHA-1哈希值（可选）",
+    "columns": {
+        "prompt": "数据集代表提示词的表头名称（默认：instruction）",
+        "query": "数据集代表请求的表头名称（默认：input）",
+        "response": "数据集代表回答的表头名称（默认：output）",
+        "history": "数据集代表历史对话的表头名称（默认：None）"
+    }
+}
+```
+
+部分预置数据集简介：
+
+| 数据集名称 | 规模 | 描述 |
+| --- | --- | --- |
+| [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) | 52k | 斯坦福大学开源的 Alpaca 数据集，训练了 Alpaca 这类早期基于 LLaMA 的模型 |
+| [Stanford Alpaca (Chinese)](https://github.com/ymcui/Chinese-LLaMA-Alpaca) | 51k | 使用 ChatGPT 翻译的 Alpaca 数据集 |
+| [GPT-4 Generated Data](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) | 100k+ | 基于 GPT-4 的 self-instruction 数据集 |
+| [BELLE 2M](https://huggingface.co/datasets/BelleGroup/train_2M_CN) | 2M | 包含约 200 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的中文指令数据 |
+| [BELLE 1M](https://huggingface.co/datasets/BelleGroup/train_1M_CN) | 1M | 包含约 100 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的中文指令数据 |
+| [BELLE 0.5M](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN) | 500k  | 包含约 50 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的中文指令数据 |
+| [BELLE Dialogue 0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M) | 400k | 包含约 40 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的个性化角色对话数据，包含角色介绍 |
+| [BELLE School Math 0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) | 250k  | 包含约 25 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的中文数学题数据，包含解题过程 |
+| [BELLE Multiturn Chat 0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) | 800k | 包含约 80 万条由 [BELLE](https://github.com/LianjiaTech/BELLE) 项目生成的用户与助手的多轮对话 |
+| [Guanaco Dataset](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset) | 100k+ | 包含日文、简繁体中文、英文等多类数据，数据集原用于 Guanaco 模型训练 |
+| [Firefly 1.1M](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) | 1.1M  | 中文对话大模型 firefly（流萤）的中文数据集，包含多个 NLP 任务 |
+| [CodeAlpaca 20k](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k) | 20k | 英文代码生成任务数据集 |
+| [Alpaca CoT](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT) | 6M | 用于微调的指令数据集集合 |
+| [Web QA](https://huggingface.co/datasets/suolyer/webqa) | 36k | 百度知道汇集的中文问答数据集 |
+| [UltraChat](https://github.com/thunlp/UltraChat) | 1.57M | 清华 NLP 发布的大规模多轮对话数据集 |
+
+注：BELLE 数据集是由 ChatGPT 产生的数据集，不保证数据准确性，所有类 GPT 模型产生的 self-instruction 数据集均不能保证其准确性。
--- a/data/alpaca_data_en_52k.json.REMOVED.git-id
+++ b/data/alpaca_data_en_52k.json.REMOVED.git-id
@@ -0,0 +1 @@
+3779ddbc040543ab1834ef216c983d6fcc06cc9a
--- a/data/alpaca_data_zh_51k.json.REMOVED.git-id
+++ b/data/alpaca_data_zh_51k.json.REMOVED.git-id
@@ -0,0 +1 @@
+fc9a6a3458caca2af8dafc6181773fe10c6d8657
--- a/data/alpaca_gpt4_data_en.json.REMOVED.git-id
+++ b/data/alpaca_gpt4_data_en.json.REMOVED.git-id
@@ -0,0 +1 @@
+25508714b7879a1e5a6764ba7f979a980f549f1a
--- a/data/alpaca_gpt4_data_zh.json.REMOVED.git-id
+++ b/data/alpaca_gpt4_data_zh.json.REMOVED.git-id
@@ -0,0 +1 @@
+7cb6a7d11455bddc3d495750a2392683d775b184
--- a/data/comparison_gpt4_data_en.json.REMOVED.git-id
+++ b/data/comparison_gpt4_data_en.json.REMOVED.git-id
@@ -0,0 +1 @@
+f437d58b7791609ee91f064551c5c5734a0fd97a
--- a/data/comparison_gpt4_data_zh.json.REMOVED.git-id
+++ b/data/comparison_gpt4_data_zh.json.REMOVED.git-id
@@ -0,0 +1 @@
+0e346cf70e633456c7e83f68765361016005447a
--- a/data/example_dataset/example_dataset.py
+++ b/data/example_dataset/example_dataset.py
@@ -0,0 +1,46 @@
+import json
+import datasets
+from typing import Any, Dict, List
+
+
+# Metadata passed to `datasets.DatasetInfo` in `ExampleDataset._info`.
+_DESCRIPTION = "An example of dataset for LLaMA."
+_CITATION = ""
+_HOMEPAGE = ""
+_LICENSE = ""
+# Handed to `dl_manager.download` in `ExampleDataset._split_generators`;
+# presumably resolved relative to this script's directory — TODO confirm.
+_URL = "examples.json"
+
+
+class ExampleDataset(datasets.GeneratorBasedBuilder):
+
+    VERSION = datasets.Version("0.0.0")
+
+    def _info(self) -> datasets.DatasetInfo:
+        features = datasets.Features({
+            "instruction": datasets.Value("string"),
+            "input": datasets.Value("string"),
+            "output": datasets.Value("string"),
+            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
+        })
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        file_path = dl_manager.download(_URL)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": file_path
+                }
+            )
+        ]
+
+    def _generate_examples(self, filepath: str) -> Dict[int, Dict[str, Any]]:
+        example_dataset = json.load(open(filepath, "r", encoding="utf-8"))
+        for key, example in enumerate(example_dataset):
+            yield key, example
--- a/data/hh_rlhf_en/hh_rlhf_en.py
+++ b/data/hh_rlhf_en/hh_rlhf_en.py
@@ -0,0 +1,97 @@
+import json
+import datasets
+from typing import Any, Dict, List
+
+
+# Metadata passed to `datasets.DatasetInfo` in `HhRlhfEn._info`.
+_DESCRIPTION = "Human preference data about helpfulness and harmlessness for ChatGLM."
+_CITATION = ""
+_HOMEPAGE = "https://huggingface.co/datasets/Anthropic/hh-rlhf"
+_LICENSE = "mit"
+# Common URL prefix shared by every archive listed in `_URLS`.
+_URL = "https://huggingface.co/datasets/Anthropic/hh-rlhf/resolve/main/"
+# Archives (`.jsonl.gz`) to download for each split, one per subset directory.
+_URLS = {
+    "train": [
+        _URL + "harmless-base/train.jsonl.gz",
+        _URL + "helpful-base/train.jsonl.gz",
+        _URL + "helpful-online/train.jsonl.gz",
+        _URL + "helpful-rejection-sampled/train.jsonl.gz"
+    ],
+    "test": [
+        _URL + "harmless-base/test.jsonl.gz",
+        _URL + "helpful-base/test.jsonl.gz",
+        _URL + "helpful-online/test.jsonl.gz",
+        _URL + "helpful-rejection-sampled/test.jsonl.gz"
+    ]
+}
+
+
+class HhRlhfEn(datasets.GeneratorBasedBuilder):
+
+    VERSION = datasets.Version("0.0.0")
+
+    def _info(self) -> datasets.DatasetInfo:
+        features = datasets.Features({
+            "instruction": datasets.Value("string"),
+            "output": datasets.Sequence(datasets.Value("string")),
+            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
+        })
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        file_path = dl_manager.download_and_extract(_URLS)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepaths": file_path["train"]
+                }
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepaths": file_path["test"]
+                }
+            )
+        ]
+
+    def _generate_examples(self, filepaths: List[str]) -> Dict[int, Dict[str, Any]]: # generate multi-turn chat for ChatGLM
+        key = 0
+        for filepath in filepaths:
+            with open(filepath, "r", encoding="utf-8") as f:
+                for row in f:
+                    data = json.loads(row)
+                    chosen = data["chosen"]
+                    rejected = data["rejected"]
+
+                    assist_idx = rejected.rfind("\n\nAssistant: ")
+                    r_reject = rejected[assist_idx+13:].strip()
+                    assist_idx = chosen.rfind("\n\nAssistant: ")
+                    r_accept = chosen[assist_idx+13:].strip()
+
+                    human_idx = chosen.rfind("\n\nHuman: ")
+                    query = chosen[human_idx+9:assist_idx].strip()
+                    prompt = chosen[:human_idx]
+                    history = []
+
+                    while prompt.rfind("\n\nAssistant: ") != -1:
+                        assist_idx = prompt.rfind("\n\nAssistant: ")
+                        human_idx = prompt.rfind("\n\nHuman: ")
+                        if human_idx != -1:
+                            old_query = prompt[human_idx+9:assist_idx].strip()
+                            old_resp = prompt[assist_idx+13:].strip()
+                            history.insert(0, (old_query, old_resp))
+                        else:
+                            break
+                        prompt = prompt[:human_idx]
+
+                    yield key, {
+                        "instruction": query,
+                        "output": [r_accept, r_reject],
+                        "history": history
+                    }
+                    key += 1
--- a/data/ultra_chat/ultra_chat.py
+++ b/data/ultra_chat/ultra_chat.py
@@ -0,0 +1,76 @@
+import json
+import datasets
+from typing import Any, Dict, List
+
+
+# Metadata passed to `datasets.DatasetInfo` in the builder's `_info` method.
+_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
+
+# BibTeX entry reported as the dataset citation.
+_CITATION = """\
+@misc{UltraChat,
+  author = {Ding, Ning and Chen, Yulin and Xu, Bokai and Hu, Shengding and Qin, Yujia and Liu, Zhiyuan and Sun, Maosong and Zhou, Bowen},
+  title = {UltraChat: A Large-scale Auto-generated Multi-round Dialogue Data},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\\url{https://github.com/thunlp/ultrachat}},
+}
+"""
+
+_HOMEPAGE = "https://huggingface.co/datasets/stingning/ultrachat"
+_LICENSE = "cc-by-nc-4.0"
+# URL template for one data shard; `{idx}` is filled with 0..8 in `_split_generators`.
+_BASE_DATA_URL = "https://huggingface.co/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
+
+
+class BelleMultiturn(datasets.GeneratorBasedBuilder):
+
+    VERSION = datasets.Version("0.0.0")
+
+    def _info(self) -> datasets.DatasetInfo:
+        features = datasets.Features({
+            "instruction": datasets.Value("string"),
+            "output": datasets.Value("string"),
+            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
+        })
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(9)] # multiple shards
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepaths": file_paths
+                }
+            )
+        ]
+
+    def _generate_examples(self, filepaths: List[str]) -> Dict[int, Dict[str, Any]]: # generate multi-turn chat for ChatGLM
+        for filepath in filepaths:
+            with open(filepath, "r", encoding="utf-8") as f:
+                for row in f:
+                    try:
+                        data = json.loads(row)
+                    except:
+                        continue
+                    key = data["id"]
+                    content = data["data"]
+                    if len(content) % 2 == 1:
+                        content.pop(-1)
+                    if len(content) < 2:
+                        continue
+
+                    query = content[-2]
+                    response = content[-1]
+                    history = [[content[2*i], content[2*i+1]] for i in range(len(content) // 2 - 1)]
+
+                    yield key, {
+                        "instruction": query,
+                        "output": response,
+                        "history": history
+                    }