[breaking change] refactor data pipeline (#6901)

* refactor data
* rename file

Former-commit-id: 7a1a4ce6451cb782573d0bd9dd27a5e443e3a18b
2025-02-13 00:39:20 +08:00
parent 80b89978d9
commit 46203856fc
27 changed files with 1145 additions and 1132 deletions
--- a/tests/data/processors/test_feedback.py
+++ b/tests/data/processors/test_feedback.py
--- a/tests/data/processors/test_pairwise.py
+++ b/tests/data/processors/test_pairwise.py
--- a/tests/data/processors/test_processor_utils.py
+++ b/tests/data/processors/test_processor_utils.py
@@ -16,7 +16,7 @@ from typing import Tuple

 import pytest

-from llamafactory.data.processors.processor_utils import infer_seqlen
+from llamafactory.data.processor.processor_utils import infer_seqlen


@pytest.mark.parametrize(
--- a/tests/data/processors/test_supervised.py
+++ b/tests/data/processors/test_supervised.py
--- a/tests/data/processors/test_unsupervised.py
+++ b/tests/data/processors/test_unsupervised.py
--- a/tests/data/test_converter.py
+++ b/tests/data/test_converter.py
@@ -0,0 +1,46 @@
+from llamafactory.data import Role
+from llamafactory.data.converter import get_dataset_converter
+from llamafactory.data.parser import DatasetAttr
+from llamafactory.hparams import DataArguments
+
+
+def test_alpaca_converter():
+    # Alpaca-style record; the assertion below expects "instruction" and
+    # "input" to end up in a single user turn joined by a newline.
+    record = {
+        "instruction": "Solve the math problem.",
+        "input": "3 + 4",
+        "output": "The answer is 7.",
+    }
+    attr = DatasetAttr("hf_hub", "llamafactory/tiny-supervised-dataset")
+    convert = get_dataset_converter("alpaca", attr, DataArguments())
+    converted = convert(record)
+
+    user_turn = {"role": Role.USER.value, "content": "Solve the math problem.\n3 + 4"}
+    assistant_turn = {"role": Role.ASSISTANT.value, "content": "The answer is 7."}
+    # Fields absent from the record are expected as empty strings (system,
+    # tools) or None (multimodal inputs).
+    expected = {
+        "_prompt": [user_turn],
+        "_response": [assistant_turn],
+        "_system": "",
+        "_tools": "",
+        "_images": None,
+        "_videos": None,
+        "_audios": None,
+    }
+    assert converted == expected
+
+
+def test_sharegpt_converter():
+    # ShareGPT-style record: a flat list of turns tagged with "from"/"value".
+    # The assertion below expects the system turn to be lifted into "_system"
+    # and the last (gpt) turn to become the response.
+    turns = [
+        {"from": "system", "value": "You are a helpful assistant."},
+        {"from": "human", "value": "Solve the math problem.\n3 + 4"},
+        {"from": "gpt", "value": "The answer is 7."},
+    ]
+    record = {"conversations": turns}
+    attr = DatasetAttr("hf_hub", "llamafactory/tiny-supervised-dataset")
+    convert = get_dataset_converter("sharegpt", attr, DataArguments())
+    converted = convert(record)
+
+    user_turn = {"role": Role.USER.value, "content": "Solve the math problem.\n3 + 4"}
+    assistant_turn = {"role": Role.ASSISTANT.value, "content": "The answer is 7."}
+    expected = {
+        "_prompt": [user_turn],
+        "_response": [assistant_turn],
+        "_system": "You are a helpful assistant.",
+        "_tools": "",
+        "_images": None,
+        "_videos": None,
+        "_audios": None,
+    }
+    assert converted == expected