add MMLU and C-Eval script

Former-commit-id: 3403f876127b4b99c5e3edb2834cc3b9a3a0063f
2023-09-23 00:34:17 +08:00
parent 4fbdf18c70
commit 35d1921081
12 changed files with 579 additions and 856 deletions
--- a/evaluation/mmlu/mmlu.py
+++ b/evaluation/mmlu/mmlu.py
@@ -0,0 +1,167 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import datasets
+import pandas as pd
+
+
+# BibTeX entry for the MMLU paper; passed as `citation` to `datasets.DatasetInfo`.
+_CITATION = """\
+@article{hendryckstest2021,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+  year={2021}
+}
+"""
+
+# Human-readable summary; passed as `description` to `datasets.DatasetInfo`.
+_DESCRIPTION = """\
+Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
+"""
+
+# Upstream project page; passed as `homepage` to `datasets.DatasetInfo`.
+_HOMEPAGE = "https://github.com/hendrycks/test"
+
+# License identifier; passed as `license` to `datasets.DatasetInfo`.
+_LICENSE = "MIT"
+
+# Archive handed to `dl_manager.download_and_extract`. It is a bare relative
+# name, so it is presumably resolved next to this script -- TODO confirm.
+_URL = "mmlu.zip"
+
+# The 57 MMLU subject names. Each entry becomes the name of one builder
+# configuration and is also interpolated into the per-split CSV file names
+# (`<task>_test.csv`, `<task>_val.csv`, `<task>_dev.csv`).
+task_list = [
+    "high_school_european_history",
+    "business_ethics",
+    "clinical_knowledge",
+    "medical_genetics",
+    "high_school_us_history",
+    "high_school_physics",
+    "high_school_world_history",
+    "virology",
+    "high_school_microeconomics",
+    "econometrics",
+    "college_computer_science",
+    "high_school_biology",
+    "abstract_algebra",
+    "professional_accounting",
+    "philosophy",
+    "professional_medicine",
+    "nutrition",
+    "global_facts",
+    "machine_learning",
+    "security_studies",
+    "public_relations",
+    "professional_psychology",
+    "prehistory",
+    "anatomy",
+    "human_sexuality",
+    "college_medicine",
+    "high_school_government_and_politics",
+    "college_chemistry",
+    "logical_fallacies",
+    "high_school_geography",
+    "elementary_mathematics",
+    "human_aging",
+    "college_mathematics",
+    "high_school_psychology",
+    "formal_logic",
+    "high_school_statistics",
+    "international_law",
+    "high_school_mathematics",
+    "high_school_computer_science",
+    "conceptual_physics",
+    "miscellaneous",
+    "high_school_chemistry",
+    "marketing",
+    "professional_law",
+    "management",
+    "college_physics",
+    "jurisprudence",
+    "world_religions",
+    "sociology",
+    "us_foreign_policy",
+    "high_school_macroeconomics",
+    "computer_security",
+    "moral_scenarios",
+    "moral_disputes",
+    "electrical_engineering",
+    "astronomy",
+    "college_biology",
+]
+
+
+class MMLUConfig(datasets.BuilderConfig):
+    """Builder configuration for one MMLU subject, pinned to version 1.0.0."""
+
+    def __init__(self, **kwargs):
+        """Forward ``kwargs`` to ``datasets.BuilderConfig`` with a fixed version.
+
+        Args:
+            **kwargs: Keyword arguments passed through unchanged to the parent
+                constructor (this file only supplies ``name``). ``version``
+                must not be included because it is always set here.
+        """
+        pinned_version = datasets.Version("1.0.0")
+        super().__init__(version=pinned_version, **kwargs)
+
+
+class MMLU(datasets.GeneratorBasedBuilder):
+    """Dataset builder exposing one configuration per MMLU subject.
+
+    Each configuration reads the CSV files of a single subject from the
+    extracted archive and serves them as the test, validation and train
+    splits (the train split is read from the ``dev`` directory).
+    """
+
+    # One configuration per subject, in the order given by ``task_list``.
+    BUILDER_CONFIGS = [MMLUConfig(name=task_name) for task_name in task_list]
+
+    def _info(self):
+        """Return the dataset metadata together with the example schema.
+
+        Returns:
+            A ``datasets.DatasetInfo`` whose features are six string columns:
+            ``question``, the choices ``A``-``D`` and ``answer``.
+        """
+        features = datasets.Features(
+            {
+                "question": datasets.Value("string"),
+                "A": datasets.Value("string"),
+                "B": datasets.Value("string"),
+                "C": datasets.Value("string"),
+                "D": datasets.Value("string"),
+                "answer": datasets.Value("string"),
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Extract the archive and describe the three splits of this subject.
+
+        Args:
+            dl_manager: Download manager supplied by ``datasets``; only its
+                ``download_and_extract`` method is used.
+
+        Returns:
+            A list of ``datasets.SplitGenerator`` objects (test, validation,
+            train -- in that order), each carrying the ``filepath`` of the
+            CSV file that ``_generate_examples`` will read.
+        """
+        data_dir = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        # (split, folder) pairs; the folder name doubles as the file-name
+        # suffix. The TRAIN split is deliberately backed by the "dev" files.
+        split_sources = [
+            (datasets.Split.TEST, "test"),
+            (datasets.Split.VALIDATION, "val"),
+            (datasets.Split.TRAIN, "dev"),
+        ]
+        return [
+            datasets.SplitGenerator(
+                name=split,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, "data", folder, f"{task_name}_{folder}.csv"
+                    ),
+                },
+            )
+            for split, folder in split_sources
+        ]
+
+    def _generate_examples(self, filepath):
+        """Yield ``(index, example)`` pairs for every row of one CSV file.
+
+        Args:
+            filepath: Path of the CSV file to read. It must contain exactly
+                six columns: question, four choices and the answer.
+
+        Yields:
+            Tuples of the zero-based row index and a dict keyed by
+            ``question``, ``A``, ``B``, ``C``, ``D`` and ``answer``.
+        """
+        # The upstream MMLU CSV files ship without a header row. With the
+        # pandas default (header=0) the first record would be consumed as
+        # column labels and then thrown away by the assignment below, silently
+        # dropping one example per split. header=None keeps every row as data.
+        # NOTE(review): pandas still infers dtypes and NA markers, so
+        # numeric-looking or "N/A"-like cells are not kept as their raw text --
+        # consider dtype=str / keep_default_na=False if exact text matters.
+        df = pd.read_csv(filepath, header=None)
+        df.columns = ["question", "A", "B", "C", "D", "answer"]
+
+        yield from enumerate(df.to_dict(orient="records"))