implement efficient packing without cross-contamination in attention

Former-commit-id: a64a5305c0da5ef092d4cc26faf829bb44de65d1
Author: ancv
Date: 2024-06-12 11:56:01 +07:00
parent 6d9fbb3fa9
commit c7ab302c69
9 changed files with 287 additions and 8 deletions


@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from transformers import ProcessorMixin
     from transformers.tokenization_utils import PreTrainedTokenizer
 
-    from ...hparams import DataArguments
+    from ...hparams import DataArguments, FinetuningArguments
     from ..template import Template
@@ -140,11 +140,12 @@ def preprocess_packed_supervised_dataset(
     model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
     knapsacks = greedy_knapsack(lengths, data_args.cutoff_len)
     for knapsack in knapsacks:
-        packed_input_ids, packed_labels = [], []
-        for length in knapsack:
+        packed_input_ids, packed_attention_mask, packed_labels = [], [], []
+        for i, length in enumerate(knapsack):
             index = length2indexes[length].pop()
             packed_input_ids += batch_input_ids[index]
             packed_labels += batch_labels[index]
+            packed_attention_mask += [i+1] * len(batch_input_ids[index])
 
         if len(packed_input_ids) < data_args.cutoff_len:
             pad_length = data_args.cutoff_len - len(packed_input_ids)
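Review note: the loop above tags every token with a 1-based id identifying which sequence of the knapsack it belongs to (`packed_attention_mask += [i+1] * ...`), with 0 implicitly left for padding. `greedy_knapsack` itself is not part of this diff; the sketch below is a minimal, self-contained illustration that assumes a first-fit-decreasing heuristic for it, plus a toy three-sequence batch, so the resulting mask layout can be seen end to end. It is not the repository's implementation.

    from collections import defaultdict
    from typing import Dict, List


    def greedy_knapsack(lengths: List[int], capacity: int) -> List[List[int]]:
        """Group lengths into knapsacks whose totals stay within capacity."""
        knapsacks: List[List[int]] = []
        remaining: List[int] = []
        for length in sorted(lengths, reverse=True):
            for i, knapsack in enumerate(knapsacks):
                if remaining[i] >= length:  # first bin with room wins
                    knapsack.append(length)
                    remaining[i] -= length
                    break
            else:  # no bin had room: open a new one
                knapsacks.append([length])
                remaining.append(capacity - length)
        return knapsacks


    # Toy batch: three tokenized sequences packed with cutoff_len = 8.
    batch_input_ids = {0: [11, 12, 13], 1: [21, 22], 2: [31, 32]}
    lengths = [3, 2, 2]
    length2indexes: Dict[int, List[int]] = defaultdict(list)
    for idx, length in enumerate(lengths):
        length2indexes[length].append(idx)

    for knapsack in greedy_knapsack(lengths, capacity=8):
        packed_input_ids: List[int] = []
        packed_attention_mask: List[int] = []
        for i, length in enumerate(knapsack):
            index = length2indexes[length].pop()
            packed_input_ids += batch_input_ids[index]
            # Each sequence in the pack gets segment id i + 1; 0 is reserved
            # for padding, mirroring the diff above.
            packed_attention_mask += [i + 1] * len(batch_input_ids[index])
        print(packed_attention_mask)  # [1, 1, 1, 2, 2, 3, 3]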
@@ -155,7 +156,10 @@ def preprocess_packed_supervised_dataset(
             raise ValueError("The length of packed example should be identical to the cutoff length.")
 
         model_inputs["input_ids"].append(packed_input_ids)
-        model_inputs["attention_mask"].append([1] * data_args.cutoff_len)
+        if data_args.efficient_packing:
+            model_inputs["attention_mask"].append(packed_attention_mask)
+        else:
+            model_inputs["attention_mask"].append([1] * data_args.cutoff_len)
         model_inputs["labels"].append(packed_labels)
 
     return model_inputs
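Review note: when `data_args.efficient_packing` is set, the stored attention mask carries these segment ids instead of the usual all-ones vector. How the model consumes them is outside this hunk (the remaining files of the commit presumably patch the attention path). The sketch below shows one standard way such a mask can be expanded; the helper `expand_packed_attention_mask` is hypothetical and not from this repository. Tokens attend only within their own segment, causally, which is exactly the cross-contamination the commit title promises to remove.

    import torch


    def expand_packed_attention_mask(attention_mask: torch.Tensor) -> torch.Tensor:
        """[batch, seq_len] segment ids -> [batch, 1, seq_len, seq_len] bool mask.

        A query may attend to a key only if both carry the same non-zero
        segment id and the key is not in the future (causal).
        """
        _, seq_len = attention_mask.shape
        same_segment = attention_mask.unsqueeze(2) == attention_mask.unsqueeze(1)
        non_padding = (attention_mask != 0).unsqueeze(2) & (attention_mask != 0).unsqueeze(1)
        causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
        return (same_segment & non_padding & causal).unsqueeze(1)


    # Two sequences of length 2 packed into one row, plus one pad slot (id 0).
    packed = torch.tensor([[1, 1, 2, 2, 0]])
    print(expand_packed_attention_mask(packed)[0, 0].int())
    # tensor([[1, 0, 0, 0, 0],
    #         [1, 1, 0, 0, 0],
    #         [0, 0, 1, 0, 0],
    #         [0, 0, 1, 1, 0],
    #         [0, 0, 0, 0, 0]], dtype=torch.int32)

The `efficient_packing` attribute read here suggests the commit also adds a matching field to `DataArguments` elsewhere in the 9 changed files, so users would toggle the behavior from their data configuration rather than in this function.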