implement efficient packing without cross-contamination in attention

Former-commit-id: a64a5305c0da5ef092d4cc26faf829bb44de65d1
Author: ancv
Date: 2024-06-12 11:56:01 +07:00
parent 6d9fbb3fa9
commit c7ab302c69
9 changed files with 287 additions and 8 deletions


@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from transformers import ProcessorMixin
     from transformers.tokenization_utils import PreTrainedTokenizer
 
-    from ...hparams import DataArguments
+    from ...hparams import DataArguments, FinetuningArguments
     from ..template import Template
@@ -140,11 +140,12 @@ def preprocess_packed_supervised_dataset(
     model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
     knapsacks = greedy_knapsack(lengths, data_args.cutoff_len)
     for knapsack in knapsacks:
-        packed_input_ids, packed_labels = [], []
-        for length in knapsack:
+        packed_input_ids, packed_attention_mask, packed_labels = [], [], []
+        for i, length in enumerate(knapsack):
             index = length2indexes[length].pop()
             packed_input_ids += batch_input_ids[index]
             packed_labels += batch_labels[index]
+            packed_attention_mask += [i+1] * len(batch_input_ids[index])
 
         if len(packed_input_ids) < data_args.cutoff_len:
             pad_length = data_args.cutoff_len - len(packed_input_ids)
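Review note: the loop above tags every token with a 1-based id identifying which sequence of the knapsack it belongs to (`packed_attention_mask += [i+1] * ...`), with 0 implicitly left for padding. `greedy_knapsack` itself is not part of this diff; the sketch below is a minimal, self-contained illustration that assumes a first-fit-decreasing heuristic for it, plus a toy three-sequence batch, so the resulting mask layout can be seen end to end. It is not the repository's implementation.

    from collections import defaultdict
    from typing import Dict, List


    def greedy_knapsack(lengths: List[int], capacity: int) -> List[List[int]]:
        """Group lengths into knapsacks whose totals stay within capacity."""
        knapsacks: List[List[int]] = []
        remaining: List[int] = []
        for length in sorted(lengths, reverse=True):
            for i, knapsack in enumerate(knapsacks):
                if remaining[i] >= length:  # first bin with room wins
                    knapsack.append(length)
                    remaining[i] -= length
                    break
            else:  # no bin had room: open a new one
                knapsacks.append([length])
                remaining.append(capacity - length)
        return knapsacks


    # Toy batch: three tokenized sequences packed with cutoff_len = 8.
    batch_input_ids = {0: [11, 12, 13], 1: [21, 22], 2: [31, 32]}
    lengths = [3, 2, 2]
    length2indexes: Dict[int, List[int]] = defaultdict(list)
    for idx, length in enumerate(lengths):
        length2indexes[length].append(idx)

    for knapsack in greedy_knapsack(lengths, capacity=8):
        packed_input_ids: List[int] = []
        packed_attention_mask: List[int] = []
        for i, length in enumerate(knapsack):
            index = length2indexes[length].pop()
            packed_input_ids += batch_input_ids[index]
            # Each sequence in the pack gets segment id i + 1; 0 is reserved
            # for padding, mirroring the diff above.
            packed_attention_mask += [i + 1] * len(batch_input_ids[index])
        print(packed_attention_mask)  # [1, 1, 1, 2, 2, 3, 3]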
@@ -155,7 +156,10 @@ def preprocess_packed_supervised_dataset(
             raise ValueError("The length of packed example should be identical to the cutoff length.")
 
         model_inputs["input_ids"].append(packed_input_ids)
-        model_inputs["attention_mask"].append([1] * data_args.cutoff_len)
+        if data_args.efficient_packing:
+            model_inputs["attention_mask"].append(packed_attention_mask)
+        else:
+            model_inputs["attention_mask"].append([1] * data_args.cutoff_len)
         model_inputs["labels"].append(packed_labels)
 
     return model_inputs
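Review note: when `data_args.efficient_packing` is set, the stored attention mask carries these segment ids instead of the usual all-ones vector. How the model consumes them is outside this hunk (the remaining files of the commit presumably patch the attention path). The sketch below shows one standard way such a mask can be expanded; the helper `expand_packed_attention_mask` is hypothetical and not from this repository. Tokens attend only within their own segment, causally, which is exactly the cross-contamination the commit title promises to remove.

    import torch


    def expand_packed_attention_mask(attention_mask: torch.Tensor) -> torch.Tensor:
        """[batch, seq_len] segment ids -> [batch, 1, seq_len, seq_len] bool mask.

        A query may attend to a key only if both carry the same non-zero
        segment id and the key is not in the future (causal).
        """
        _, seq_len = attention_mask.shape
        same_segment = attention_mask.unsqueeze(2) == attention_mask.unsqueeze(1)
        non_padding = (attention_mask != 0).unsqueeze(2) & (attention_mask != 0).unsqueeze(1)
        causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
        return (same_segment & non_padding & causal).unsqueeze(1)


    # Two sequences of length 2 packed into one row, plus one pad slot (id 0).
    packed = torch.tensor([[1, 1, 2, 2, 0]])
    print(expand_packed_attention_mask(packed)[0, 0].int())
    # tensor([[1, 0, 0, 0, 0],
    #         [1, 1, 0, 0, 0],
    #         [0, 0, 1, 0, 0],
    #         [0, 0, 1, 1, 0],
    #         [0, 0, 0, 0, 0]], dtype=torch.int32)

The `efficient_packing` attribute read here suggests the commit also adds a matching field to `DataArguments` elsewhere in the 9 changed files, so users would toggle the behavior from their data configuration rather than in this function.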