mirror of
https://github.com/hiyouga/LlamaFactory.git
synced 2026-02-01 20:23:37 +00:00
[v1] add data converter (#9263)
This commit is contained in:
@@ -0,0 +1,71 @@
|
||||
# Copyright 2025 the LlamaFactory team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from typing import Callable, NotRequired, TypedDict
|
||||
|
||||
from ...extras.types import Sample, SFTSample
|
||||
|
||||
|
||||
class AlpacaSample(TypedDict, total=False):
    """Raw Alpaca-format sample in which every key is optional.

    ``total=False`` already marks all keys as non-required, so the fields are
    declared with their plain value type rather than a redundant
    ``NotRequired[...]`` wrapper.
    """

    # Text emitted as the "system" message.
    system: str
    # First part of the "user" message text.
    instruction: str
    # Second part of the "user" message text, appended directly to `instruction`.
    input: str
    # Text emitted as the "assistant" message.
    output: str
|
||||
|
||||
|
||||
def alpaca_converter(raw_sample: AlpacaSample) -> SFTSample:
    """Convert Alpaca sample to SFT sample.

    Messages are emitted in the order system, user, assistant. The user text is
    the instruction immediately followed by the input, with no separator. Only
    the assistant message has a non-zero loss weight.

    A key whose value is ``None`` is handled like a missing key: it produces no
    message, and a ``None`` instruction or input contributes an empty string to
    the user text instead of raising ``TypeError`` on concatenation.
    NOTE(review): presumably such nulls come from loaders that pad absent
    columns -- confirm against the data loader.

    Args:
        raw_sample (AlpacaSample): Alpaca sample.

    Returns:
        SFTSample: SFT sample.
    """
    system = raw_sample.get("system")
    instruction = raw_sample.get("instruction")
    input_text = raw_sample.get("input")
    output = raw_sample.get("output")

    # Collect (role, text, loss_weight) for each turn that is actually present.
    turns = []
    if system is not None:
        turns.append(("system", system, 0.0))

    if instruction is not None or input_text is not None:
        # Either half may be absent; substitute "" so the concatenation is safe.
        turns.append(("user", (instruction or "") + (input_text or ""), 0.0))

    if output is not None:
        turns.append(("assistant", output, 1.0))

    messages = [
        {"role": role, "content": [{"type": "text", "value": text}], "loss_weight": weight}
        for role, text, weight in turns
    ]
    return {"messages": messages}
|
||||
|
||||
|
||||
# Registry of sample converters, keyed by the name that `get_converter` looks up.
CONVERTERS = {"alpaca": alpaca_converter}
|
||||
|
||||
|
||||
def get_converter(converter_name: str) -> Callable[[dict], Sample]:
    """Get the converter registered under the given name.

    Args:
        converter_name (str): Key to look up in ``CONVERTERS``.

    Returns:
        Callable[[dict], Sample]: Converter stored under ``converter_name``.

    Raises:
        ValueError: If ``converter_name`` is not a key of ``CONVERTERS``.
    """
    try:
        return CONVERTERS[converter_name]
    except KeyError:
        # List the registered names so a typo can be fixed from the message alone;
        # `from None` hides the internal KeyError from the traceback.
        available = ", ".join(sorted(CONVERTERS))
        raise ValueError(
            f"Converter {converter_name} not found. Available converters: {available}."
        ) from None
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Optional, Union
|
||||
from typing import Any, Literal, Optional, Union
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
@@ -25,7 +25,10 @@ from ...extras.types import DatasetInfo, HFDataset
|
||||
|
||||
@dataclass
|
||||
class DataLoaderPlugin:
|
||||
"""Plugin for loading dataset."""
|
||||
|
||||
args: DataArguments
|
||||
"""Data arguments."""
|
||||
|
||||
def _get_builder_name(self, path: str) -> Literal["arrow", "csv", "json", "parquet", "text"]:
|
||||
"""Get dataset builder name.
|
||||
@@ -66,9 +69,21 @@ class DataLoaderPlugin:
|
||||
|
||||
@dataclass
|
||||
class DataIndexPlugin:
|
||||
"""Plugin for adjusting dataset index."""
|
||||
|
||||
def adjust_data_index(
|
||||
self, data_index: list[tuple[str, int]], size: Optional[int], weight: Optional[float]
|
||||
) -> list[tuple[str, int]]:
|
||||
"""Adjust dataset index by size and weight.
|
||||
|
||||
Args:
|
||||
data_index (list[tuple[str, int]]): List of (dataset_name, sample_index).
|
||||
size (Optional[int]): Desired dataset size.
|
||||
weight (Optional[float]): Desired dataset weight.
|
||||
|
||||
Returns:
|
||||
list[tuple[str, int]]: Adjusted dataset index.
|
||||
"""
|
||||
if size is not None:
|
||||
data_index = self.adjust_by_size(data_index, size)
|
||||
|
||||
@@ -85,18 +100,24 @@ class DataIndexPlugin:
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataGetItemPlugin:
|
||||
datasets: dict[str, HFDataset]
|
||||
class DataSelectorPlugin:
|
||||
"""Plugin for selecting dataset samples."""
|
||||
|
||||
data_index: list[tuple[str, int]]
|
||||
"""List of (dataset_name, sample_index)"""
|
||||
|
||||
def _get_by_index(self, index: int) -> dict:
|
||||
dataset_name, sample_index = self.data_index[index]
|
||||
return {"_dataset_name": dataset_name, **self.datasets[dataset_name][sample_index]}
|
||||
def select(self, index: Union[slice, list[int], Any]) -> Union[tuple[str, int], list[tuple[str, int]]]:
|
||||
"""Select dataset samples.
|
||||
|
||||
def get_data(self, index: Union[slice, list[int]]) -> list[dict]:
|
||||
Args:
|
||||
index (Union[slice, list[int], Any]): Index of dataset samples.
|
||||
|
||||
Returns:
|
||||
Union[tuple[str, int], list[tuple[str, int]]]: Selected dataset samples.
|
||||
"""
|
||||
if isinstance(index, slice):
|
||||
return [self._get_by_index(i) for i in range(*index.indices(len(self.data_index)))]
|
||||
return [self.data_index[i] for i in range(*index.indices(len(self.data_index)))]
|
||||
elif isinstance(index, list):
|
||||
return [self._get_by_index(i) for i in index]
|
||||
return [self.data_index[i] for i in index]
|
||||
else:
|
||||
raise ValueError(f"Invalid index type {type(index)}.")
|
||||
|
||||
Reference in New Issue
Block a user