support dataset cache

Former-commit-id: f79ee62eb4a2a4a01cb4e2a6aa2d07158cf8eb59
2023-10-26 21:48:45 +08:00
parent 67a46e553f
commit c762168ed0
2 changed files with 26 additions and 3 deletions
--- a/src/llmtuner/hparams/data_args.py
+++ b/src/llmtuner/hparams/data_args.py
@@ -98,6 +98,10 @@ class DataArguments:
        default=False,
        metadata={"help": "Packing the questions and answers in the supervised fine-tuning stage."}
    )
+    cache_path: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to save or load the preprocessed datasets."}
+    )

    def __post_init__(self):
        if self.streaming and self.val_size > 1e-6 and self.val_size < 1:
@@ -106,6 +110,9 @@ class DataArguments:
        if self.streaming and self.max_samples is not None:
            raise ValueError("`max_samples` is incompatible with `streaming`.")

+        if self.streaming and self.cache_path:
+            raise ValueError("`cache_path` is incompatible with `streaming`.")
+
    def init_for_training(self, seed: int): # support mixing multiple datasets
        self.seed = seed
        dataset_names = [ds.strip() for ds in self.dataset.split(",")] if self.dataset is not None else []