refactor data preprocessing, fix mllm rlhf

Former-commit-id: 53ff2dd24f9121ea30c95063bb72e49a9b31e980
This commit is contained in:
hiyouga
2024-05-24 04:08:25 +08:00
parent 1078611259
commit bf59383783
15 changed files with 572 additions and 464 deletions

View File

@@ -1,5 +1,6 @@
import inspect
import os
import sys
from typing import TYPE_CHECKING, Literal, Optional, Union
from datasets import load_dataset, load_from_disk
@@ -167,12 +168,15 @@ def get_dataset(
logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path))
logger.info("Please restart the training with `--tokenized_path {}`.".format(data_args.tokenized_path))
exit(0)
sys.exit(0)
if training_args.should_log:
try:
print_function(next(iter(dataset)))
except StopIteration:
raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
if stage == "pt":
raise RuntimeError("Cannot find sufficient samples, consider increasing dataset size.")
else:
raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
return dataset