From e70651ac58af6c3446741e8f6b6d4d0d913248c5 Mon Sep 17 00:00:00 2001 From: Meng WANG <49304833+Moenupa@users.noreply.github.com> Date: Tue, 20 Jan 2026 15:54:07 +0800 Subject: [PATCH] [feat] support `all_exhausted_without_replacement` in datasets.interleave_datasets (#10112) --- src/llamafactory/data/data_utils.py | 8 +++++++- src/llamafactory/hparams/data_args.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/data/data_utils.py b/src/llamafactory/data/data_utils.py index 139425a22..8fcd9554f 100644 --- a/src/llamafactory/data/data_utils.py +++ b/src/llamafactory/data/data_utils.py @@ -65,11 +65,17 @@ def merge_dataset( if not data_args.streaming: logger.warning_rank0_once("We recommend using `mix_strategy=concat` in non-streaming mode.") + strategy_map: str = { + "interleave_under": "first_exhausted", + "interleave_over": "all_exhausted", + "interleave_once": "all_exhausted_without_replacement", + }[data_args.mix_strategy] + return interleave_datasets( datasets=all_datasets, probabilities=data_args.interleave_probs, seed=seed, - stopping_strategy="first_exhausted" if data_args.mix_strategy.endswith("under") else "all_exhausted", + stopping_strategy=strategy_map, # type: ignore ) else: diff --git a/src/llamafactory/hparams/data_args.py b/src/llamafactory/hparams/data_args.py index 921019a02..49d75cf08 100644 --- a/src/llamafactory/hparams/data_args.py +++ b/src/llamafactory/hparams/data_args.py @@ -63,9 +63,9 @@ class DataArguments: default=16384, metadata={"help": "Size of the buffer to randomly sample examples from in dataset streaming."}, ) - mix_strategy: Literal["concat", "interleave_under", "interleave_over"] = field( + mix_strategy: Literal["concat", "interleave_under", "interleave_over", "interleave_once"] = field( default="concat", - metadata={"help": "Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling)."}, + metadata={"help": "Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling/sampling w.o. replacement)."}, ) interleave_probs: str | None = field( default=None,