add pre-training script

Former-commit-id: 935d58de2b3a2eadc4f0bed28c3ad7dee32e9fd5
This commit is contained in:
hiyouga
2023-05-29 21:37:22 +08:00
parent 304be6dc28
commit 33fee45217
6 changed files with 159 additions and 21 deletions

View File

@@ -18,14 +18,14 @@ from utils import (
def main():
-# prepare pretrained model and dataset
+# Prepare pretrained model and dataset
model_args, data_args, training_args, finetuning_args = prepare_args(stage="rm")
dataset = prepare_data(model_args, data_args)
model, tokenizer = load_pretrained(model_args, finetuning_args, training_args.do_train, stage="rm")
dataset = preprocess_data(dataset, tokenizer, data_args, training_args, stage="rm")
data_collator = PairwiseDataCollatorForLLaMA(tokenizer, model.pretrained_model)
-training_args.remove_unused_columns = False # Important for pairwise dataset
+training_args.remove_unused_columns = False # important for pairwise dataset
# Split the dataset
if training_args.do_train: