Files
LLaMA-Factory/src/llmtuner/dsets/utils.py
hiyouga dd3f3e9749 support streaming data, fix #284 #274 #268
Former-commit-id: 819cc1353599e5fa45658bc56dd0dbe4b258b197
2023-07-31 23:33:00 +08:00

16 lines
547 B
Python

from typing import TYPE_CHECKING, Dict
if TYPE_CHECKING:
from datasets import Dataset
def split_dataset(dataset: "Dataset", dev_ratio: float, do_train: bool) -> Dict[str, "Dataset"]:
if do_train:
if dev_ratio > 1e-6: # Split the dataset
dataset = dataset.train_test_split(test_size=dev_ratio)
return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]}
else:
return {"train_dataset": dataset}
else: # do_eval or do_predict
return {"eval_dataset": dataset}