support dataset cache

Former-commit-id: f79ee62eb4a2a4a01cb4e2a6aa2d07158cf8eb59
This commit is contained in:
hiyouga
2023-10-26 21:48:45 +08:00
parent 67a46e553f
commit c762168ed0
2 changed files with 26 additions and 3 deletions

View File

@@ -98,6 +98,10 @@ class DataArguments:
default=False,
metadata={"help": "Packing the questions and answers in the supervised fine-tuning stage."}
)
# Optional path used to save or load the preprocessed datasets (per the help
# text below); defaults to None, i.e. unset.
# __post_init__ raises ValueError when this is set together with `streaming`.
cache_path: Optional[str] = field(
default=None,
metadata={"help": "Path to save or load the preprocessed datasets."}
)
def __post_init__(self):
if self.streaming and self.val_size > 1e-6 and self.val_size < 1:
@@ -106,6 +110,9 @@ class DataArguments:
if self.streaming and self.max_samples is not None:
raise ValueError("`max_samples` is incompatible with `streaming`.")
if self.streaming and self.cache_path:
raise ValueError("`cache_path` is incompatible with `streaming`.")
def init_for_training(self, seed: int): # support mixing multiple datasets
self.seed = seed
dataset_names = [ds.strip() for ds in self.dataset.split(",")] if self.dataset is not None else []