add MMLU and C-Eval script
Former-commit-id: 3403f876127b4b99c5e3edb2834cc3b9a3a0063f
This commit is contained in:
167
evaluation/mmlu/mmlu.py
Normal file
167
evaluation/mmlu/mmlu.py
Normal file
@@ -0,0 +1,167 @@
|
||||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
|
||||
import datasets
|
||||
import pandas as pd
|
||||
|
||||
|
||||
_CITATION = """\
|
||||
@article{hendryckstest2021,
|
||||
title={Measuring Massive Multitask Language Understanding},
|
||||
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
||||
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
|
||||
year={2021}
|
||||
}
|
||||
"""
|
||||
|
||||
_DESCRIPTION = """\
|
||||
Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
|
||||
"""
|
||||
|
||||
_HOMEPAGE = "https://github.com/hendrycks/test"
|
||||
|
||||
_LICENSE = "MIT"
|
||||
|
||||
_URL = "mmlu.zip"
|
||||
|
||||
task_list = [
|
||||
"high_school_european_history",
|
||||
"business_ethics",
|
||||
"clinical_knowledge",
|
||||
"medical_genetics",
|
||||
"high_school_us_history",
|
||||
"high_school_physics",
|
||||
"high_school_world_history",
|
||||
"virology",
|
||||
"high_school_microeconomics",
|
||||
"econometrics",
|
||||
"college_computer_science",
|
||||
"high_school_biology",
|
||||
"abstract_algebra",
|
||||
"professional_accounting",
|
||||
"philosophy",
|
||||
"professional_medicine",
|
||||
"nutrition",
|
||||
"global_facts",
|
||||
"machine_learning",
|
||||
"security_studies",
|
||||
"public_relations",
|
||||
"professional_psychology",
|
||||
"prehistory",
|
||||
"anatomy",
|
||||
"human_sexuality",
|
||||
"college_medicine",
|
||||
"high_school_government_and_politics",
|
||||
"college_chemistry",
|
||||
"logical_fallacies",
|
||||
"high_school_geography",
|
||||
"elementary_mathematics",
|
||||
"human_aging",
|
||||
"college_mathematics",
|
||||
"high_school_psychology",
|
||||
"formal_logic",
|
||||
"high_school_statistics",
|
||||
"international_law",
|
||||
"high_school_mathematics",
|
||||
"high_school_computer_science",
|
||||
"conceptual_physics",
|
||||
"miscellaneous",
|
||||
"high_school_chemistry",
|
||||
"marketing",
|
||||
"professional_law",
|
||||
"management",
|
||||
"college_physics",
|
||||
"jurisprudence",
|
||||
"world_religions",
|
||||
"sociology",
|
||||
"us_foreign_policy",
|
||||
"high_school_macroeconomics",
|
||||
"computer_security",
|
||||
"moral_scenarios",
|
||||
"moral_disputes",
|
||||
"electrical_engineering",
|
||||
"astronomy",
|
||||
"college_biology",
|
||||
]
|
||||
|
||||
|
||||
class MMLUConfig(datasets.BuilderConfig):
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(version=datasets.Version("1.0.0"), **kwargs)
|
||||
|
||||
|
||||
class MMLU(datasets.GeneratorBasedBuilder):
|
||||
BUILDER_CONFIGS = [
|
||||
MMLUConfig(
|
||||
name=task_name,
|
||||
)
|
||||
for task_name in task_list
|
||||
]
|
||||
|
||||
def _info(self):
|
||||
features = datasets.Features(
|
||||
{
|
||||
"question": datasets.Value("string"),
|
||||
"A": datasets.Value("string"),
|
||||
"B": datasets.Value("string"),
|
||||
"C": datasets.Value("string"),
|
||||
"D": datasets.Value("string"),
|
||||
"answer": datasets.Value("string"),
|
||||
}
|
||||
)
|
||||
return datasets.DatasetInfo(
|
||||
description=_DESCRIPTION,
|
||||
features=features,
|
||||
homepage=_HOMEPAGE,
|
||||
license=_LICENSE,
|
||||
citation=_CITATION,
|
||||
)
|
||||
|
||||
def _split_generators(self, dl_manager):
|
||||
data_dir = dl_manager.download_and_extract(_URL)
|
||||
task_name = self.config.name
|
||||
return [
|
||||
datasets.SplitGenerator(
|
||||
name=datasets.Split.TEST,
|
||||
gen_kwargs={
|
||||
"filepath": os.path.join(
|
||||
data_dir, "data", "test", f"{task_name}_test.csv"
|
||||
),
|
||||
},
|
||||
),
|
||||
datasets.SplitGenerator(
|
||||
name=datasets.Split.VALIDATION,
|
||||
gen_kwargs={
|
||||
"filepath": os.path.join(
|
||||
data_dir, "data", "val", f"{task_name}_val.csv"
|
||||
),
|
||||
},
|
||||
),
|
||||
datasets.SplitGenerator(
|
||||
name=datasets.Split.TRAIN,
|
||||
gen_kwargs={
|
||||
"filepath": os.path.join(
|
||||
data_dir, "data", "dev", f"{task_name}_dev.csv"
|
||||
),
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
def _generate_examples(self, filepath):
|
||||
df = pd.read_csv(filepath)
|
||||
df.columns = ["question", "A", "B", "C", "D", "answer"]
|
||||
|
||||
for i, instance in enumerate(df.to_dict(orient="records")):
|
||||
yield i, instance
|
||||
Reference in New Issue
Block a user