[breaking change] refactor data pipeline (#6901)
* refactor data * rename file Former-commit-id: 7a1a4ce6451cb782573d0bd9dd27a5e443e3a18b
This commit is contained in:
@@ -16,7 +16,7 @@ from typing import Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
from llamafactory.data.processors.processor_utils import infer_seqlen
|
||||
from llamafactory.data.processor.processor_utils import infer_seqlen
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
46
tests/data/test_converter.py
Normal file
46
tests/data/test_converter.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from llamafactory.data import Role
|
||||
from llamafactory.data.converter import get_dataset_converter
|
||||
from llamafactory.data.parser import DatasetAttr
|
||||
from llamafactory.hparams import DataArguments
|
||||
|
||||
|
||||
def test_alpaca_converter():
|
||||
dataset_attr = DatasetAttr("hf_hub", "llamafactory/tiny-supervised-dataset")
|
||||
data_args = DataArguments()
|
||||
example = {
|
||||
"instruction": "Solve the math problem.",
|
||||
"input": "3 + 4",
|
||||
"output": "The answer is 7.",
|
||||
}
|
||||
dataset_converter = get_dataset_converter("alpaca", dataset_attr, data_args)
|
||||
assert dataset_converter(example) == {
|
||||
"_prompt": [{"role": Role.USER.value, "content": "Solve the math problem.\n3 + 4"}],
|
||||
"_response": [{"role": Role.ASSISTANT.value, "content": "The answer is 7."}],
|
||||
"_system": "",
|
||||
"_tools": "",
|
||||
"_images": None,
|
||||
"_videos": None,
|
||||
"_audios": None,
|
||||
}
|
||||
|
||||
|
||||
def test_sharegpt_converter():
|
||||
dataset_attr = DatasetAttr("hf_hub", "llamafactory/tiny-supervised-dataset")
|
||||
data_args = DataArguments()
|
||||
example = {
|
||||
"conversations": [
|
||||
{"from": "system", "value": "You are a helpful assistant."},
|
||||
{"from": "human", "value": "Solve the math problem.\n3 + 4"},
|
||||
{"from": "gpt", "value": "The answer is 7."},
|
||||
]
|
||||
}
|
||||
dataset_converter = get_dataset_converter("sharegpt", dataset_attr, data_args)
|
||||
assert dataset_converter(example) == {
|
||||
"_prompt": [{"role": Role.USER.value, "content": "Solve the math problem.\n3 + 4"}],
|
||||
"_response": [{"role": Role.ASSISTANT.value, "content": "The answer is 7."}],
|
||||
"_system": "You are a helpful assistant.",
|
||||
"_tools": "",
|
||||
"_images": None,
|
||||
"_videos": None,
|
||||
"_audios": None,
|
||||
}
|
||||
Reference in New Issue
Block a user