Merge branch 'main' into add_dataset_sample_num

Former-commit-id: 26300127c45f24e63b91f1b0cc73e46c3a936a91
This commit is contained in:
seanzhang-zhichen
2024-05-24 15:57:47 +08:00
committed by GitHub
27 changed files with 756 additions and 513 deletions

View File

@@ -2,6 +2,7 @@ import inspect
import os
import numpy as np
from numpy.random import RandomState
+import sys
from typing import TYPE_CHECKING, Literal, Optional, Union
from datasets import load_dataset, load_from_disk
@@ -180,12 +181,15 @@ def get_dataset(
logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path))
logger.info("Please restart the training with `--tokenized_path {}`.".format(data_args.tokenized_path))
-exit(0)
+sys.exit(0)
if training_args.should_log:
try:
print_function(next(iter(dataset)))
except StopIteration:
-raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
+if stage == "pt":
+raise RuntimeError("Cannot find sufficient samples, consider increasing dataset size.")
+else:
+raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
return dataset