improve aligner

Former-commit-id: cc7296b92e10c24967fc753393275b71d300683f
This commit is contained in:
hiyouga
2024-02-10 16:39:19 +08:00
parent a41fa6e730
commit 1955a8ea5a
10 changed files with 80 additions and 64 deletions

View File

@@ -30,6 +30,7 @@ def load_single_dataset(
model_args: "ModelArguments",
data_args: "DataArguments",
):
+logger.info("Loading dataset {}...".format(dataset_attr))
data_path, data_name, data_dir, data_files = None, None, None, None
if dataset_attr.load_from in ["hf_hub", "ms_hub"]:
data_path = dataset_attr.dataset_name
@@ -60,7 +61,7 @@ def load_single_dataset(
if data_path is None:
raise ValueError("File extension must be txt, csv, json or jsonl.")
-checksum(data_files, dataset_attr.dataset_sha1)
+checksum(data_files, dataset_attr.file_sha1)
else:
raise NotImplementedError
@@ -157,7 +158,7 @@ def get_dataset(
with training_args.main_process_first(desc="load dataset"):
all_datasets = []
-for dataset_attr in get_dataset_list(data_args): # TODO: add split
+for dataset_attr in get_dataset_list(data_args):
all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args))
dataset = merge_dataset(all_datasets, data_args, training_args)
@@ -185,6 +186,6 @@ def get_dataset(
try:
print_function(next(iter(dataset)))
except StopIteration:
-raise RuntimeError("Empty dataset!")
+raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
return dataset