[v1] add data converter (#9263)

2026-03-25 01:13:07 +00:00 · 2025-10-13 15:54:47 +08:00
parent 48974783da
commit 52e46e162e
7 changed files with 266 additions and 62 deletions
--- a/src/llamafactory/v1/plugins/data_plugins/converter.py
+++ b/src/llamafactory/v1/plugins/data_plugins/converter.py
@@ -0,0 +1,71 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Callable, NotRequired, TypedDict
+
+from ...extras.types import Sample, SFTSample
+
+
+class AlpacaSample(TypedDict, total=False):  # raw Alpaca-style record; total=False already makes every key optional
+    system: NotRequired[str]  # text of the system message
+    instruction: NotRequired[str]  # first part of the user message text
+    input: NotRequired[str]  # second part of the user message text, placed after the instruction
+    output: NotRequired[str]  # text of the assistant message
+
+
+def alpaca_converter(raw_sample: AlpacaSample) -> SFTSample:
+    """Convert an Alpaca sample to an SFT sample.
+
+    One message is added per role whose keys are present; only the assistant message carries loss weight 1.0.
+    The user text is the non-empty ``instruction`` and ``input`` joined by a newline; ``None`` counts as empty.
+
+    Args:
+        raw_sample (AlpacaSample): Alpaca sample.
+
+    Returns:
+        SFTSample: SFT sample.
+    """
+    messages = []
+    if "system" in raw_sample:
+        messages.append(
+            {"role": "system", "content": [{"type": "text", "value": raw_sample["system"]}], "loss_weight": 0.0}
+        )
+
+    if "instruction" in raw_sample or "input" in raw_sample:
+        # Join with a newline so the instruction's last word is not fused with the input's first one;
+        # empty or None fields are dropped, so a lone field is passed through unchanged.
+        query = "\n".join(part for part in (raw_sample.get("instruction"), raw_sample.get("input")) if part)
+        messages.append(
+            {"role": "user", "content": [{"type": "text", "value": query}], "loss_weight": 0.0}
+        )
+
+    if "output" in raw_sample:
+        messages.append(
+            {"role": "assistant", "content": [{"type": "text", "value": raw_sample["output"]}], "loss_weight": 1.0}
+        )
+
+    return {"messages": messages}
+
+
+CONVERTERS = {  # registry mapping a converter name to its conversion function; read by get_converter
+    "alpaca": alpaca_converter,
+}
+
+
+def get_converter(converter_name: str) -> Callable[[dict], Sample]:
+    """Return the converter registered under ``converter_name``; raise ``ValueError`` if there is none."""
+    if (converter := CONVERTERS.get(converter_name)) is None:
+        raise ValueError(f"Converter {converter_name} not found.")
+    return converter
--- a/src/llamafactory/v1/plugins/data_plugins/loader.py
+++ b/src/llamafactory/v1/plugins/data_plugins/loader.py
@@ -15,7 +15,7 @@

 import os
 from dataclasses import dataclass
-from typing import Literal, Optional, Union
+from typing import Any, Literal, Optional, Union

 from datasets import load_dataset

@@ -25,7 +25,10 @@ from ...extras.types import DatasetInfo, HFDataset

@dataclass
 class DataLoaderPlugin:
+    """Plugin for loading dataset."""
+
    args: DataArguments
+    """Data arguments."""

    def _get_builder_name(self, path: str) -> Literal["arrow", "csv", "json", "parquet", "text"]:
        """Get dataset builder name.
@@ -66,9 +69,21 @@ class DataLoaderPlugin:

@dataclass
 class DataIndexPlugin:
+    """Plugin for adjusting dataset index."""
+
    def adjust_data_index(
        self, data_index: list[tuple[str, int]], size: Optional[int], weight: Optional[float]
    ) -> list[tuple[str, int]]:
+        """Adjust dataset index by size and weight.
+
+        Args:
+            data_index (list[tuple[str, int]]): List of (dataset_name, sample_index).
+            size (Optional[int]): Desired dataset size.
+            weight (Optional[float]): Desired dataset weight.
+
+        Returns:
+            list[tuple[str, int]]: Adjusted dataset index.
+        """
        if size is not None:
            data_index = self.adjust_by_size(data_index, size)

@@ -85,18 +100,24 @@ class DataIndexPlugin:


@dataclass
-class DataGetItemPlugin:
-    datasets: dict[str, HFDataset]
+class DataSelectorPlugin:
+    """Plugin for selecting dataset samples."""
+
    data_index: list[tuple[str, int]]
+    """List of (dataset_name, sample_index)."""

-    def _get_by_index(self, index: int) -> dict:
-        dataset_name, sample_index = self.data_index[index]
-        return {"_dataset_name": dataset_name, **self.datasets[dataset_name][sample_index]}
+    def select(self, index: Union[slice, list[int], Any]) -> Union[tuple[str, int], list[tuple[str, int]]]:
+        """Select entries of the data index by slice or list; any other index type raises ValueError.

-    def get_data(self, index: Union[slice, list[int]]) -> list[dict]:
+        Args:
+            index (Union[slice, list[int], Any]): Slice or list of positions in ``data_index``.
+
+        Returns:
+            Union[tuple[str, int], list[tuple[str, int]]]: The selected (dataset_name, sample_index) entries.
+        """
        if isinstance(index, slice):
-            return [self._get_by_index(i) for i in range(*index.indices(len(self.data_index)))]
+            return [self.data_index[i] for i in range(*index.indices(len(self.data_index)))]
        elif isinstance(index, list):
-            return [self._get_by_index(i) for i in index]
+            return [self.data_index[i] for i in index]
        else:
            raise ValueError(f"Invalid index type {type(index)}.")