change to right-padding, update reward score #803
Former-commit-id: baa90415bc8f5ebd423d001378b51c3a3a6c2ec7
@@ -102,6 +102,7 @@ class PPOPeftTrainer(PPOTrainer, PeftTrainer):
             # Get inputs
             queries, responses = self.get_inputs(batch, length_sampler, **gen_kwargs)
+            self.tokenizer.padding_side = "right" # change padding side
             rewards = self.get_rewards(queries, responses, unwrapped_model)

             # Cast to training mode
@@ -110,6 +111,7 @@ class PPOPeftTrainer(PPOTrainer, PeftTrainer):
             # Run PPO step
             stats = self.step(queries, responses, rewards)
+            self.tokenizer.padding_side = "left" # restore padding side
             loss_meter.update(stats["ppo/loss/total"], n=len(rewards))
             reward_meter.update(torch.stack(rewards).mean().item(), n=len(rewards))
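The two added lines flip the tokenizer's padding side around reward computation: generation wants left padding so every prompt ends flush at the right edge, while scoring wants right padding so each row's last real token is easy to locate. Below is a minimal plain-PyTorch sketch of what each side produces; the pad helper and token ids are invented for illustration.

import torch

def pad(seqs, pad_id, side):
    # Pad variable-length token lists to a common width on one side.
    width = max(len(s) for s in seqs)
    rows, masks = [], []
    for s in seqs:
        fill, ones = [pad_id] * (width - len(s)), [1] * len(s)
        rows.append(fill + s if side == "left" else s + fill)
        masks.append([0] * len(fill) + ones if side == "left" else ones + [0] * len(fill))
    return torch.tensor(rows), torch.tensor(masks)

prompts = [[5, 6, 7], [8]]  # made-up token ids
ids_left, _ = pad(prompts, pad_id=0, side="left")
ids_right, mask_right = pad(prompts, pad_id=0, side="right")

# Left padding: every row ends at the right edge, so generate() can append
# new tokens after the same final position for the whole batch.
print(ids_left)    # tensor([[5, 6, 7], [0, 0, 8]])

# Right padding: rows start at the left edge; the last real token of row i
# sits at mask_right[i].nonzero()[-1], which is how the new reward code indexes values.
print(ids_right)   # tensor([[5, 6, 7], [8, 0, 0]])
print(mask_right[1].nonzero()[-1])  # tensor([0])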
@@ -169,7 +171,11 @@ class PPOPeftTrainer(PPOTrainer, PeftTrainer):
         query, response = batch["input_ids"].detach().cpu(), response[:, batch["input_ids"].size(-1):].detach().cpu()
         for i in range(len(query)):
             query_length = (query[i] != self.tokenizer.pad_token_id).nonzero()[0]
-            response_length = (response[i] != self.tokenizer.pad_token_id).nonzero()[-1] + 1
+            response_index = (response[i] != self.tokenizer.pad_token_id).nonzero()
+            if len(response_index) == 0:
+                response_length = 1 # allow empty response
+            else:
+                response_length = response_index[-1] + 1
             queries.append(query[i, query_length:]) # remove padding from left
             responses.append(response[i, :response_length]) # remove padding from right
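The new response_index guard fixes a crash when a generation consists entirely of padding: nonzero() then returns an empty tensor, and indexing it with [-1] raises IndexError. A small sketch of the failure mode, with made-up token ids and pad id 0:

import torch

pad_id = 0  # made-up pad token id
responses = [torch.tensor([4, 5, 0]),   # ordinary response, right-padded
             torch.tensor([0, 0, 0])]   # degenerate case: nothing but padding

for resp in responses:
    index = (resp != pad_id).nonzero()
    # The old one-liner did `index[-1] + 1` unconditionally, which raises
    # IndexError on the empty tensor produced by the all-padding row.
    length = 1 if len(index) == 0 else int(index[-1]) + 1
    print(resp[:length])  # tensor([4, 5]) then tensor([0])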
@@ -194,7 +200,11 @@ class PPOPeftTrainer(PPOTrainer, PeftTrainer):
         if values.size(0) != batch["input_ids"].size(0): # adapt to chatglm2
             values = torch.transpose(values, 0, 1)

-        rewards = [reward for reward in values[:, -1].float().detach().cpu()] # use fp32 type
+        rewards = []
+        for i in range(values.size(0)):
+            end_index = batch["attention_mask"][i].nonzero()[-1]
+            rewards.append(values[i, end_index].float().detach().cpu()) # use fp32 type

         replace_model(unwrapped_model, target="default")
         return rewards
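This is the "update reward score" half of the commit: with right padding, the last column of values can sit on a pad token, so values[:, -1] may return the score of padding rather than of the response's final token. The replacement walks attention_mask to find the last attended position per row. A toy sketch, where the value matrix is fabricated (a real reward head would produce it):

import torch

# Fabricated per-position scores from a reward head, batch of 2, right-padded:
values = torch.tensor([[0.1, 0.2, 0.9],
                       [0.3, 0.7, 0.0]])   # row 1's final column is a pad position
attention_mask = torch.tensor([[1, 1, 1],
                               [1, 1, 0]])

# Old indexing reads the last column regardless of padding:
print(values[:, -1])                       # tensor([0.9000, 0.0000]) -- row 1 scored a pad token

# New indexing reads the last attended position of each row:
rewards = []
for i in range(values.size(0)):
    end_index = attention_mask[i].nonzero()[-1]
    rewards.append(values[i, end_index].float().detach().cpu())
print(torch.stack(rewards).squeeze())      # tensor([0.9000, 0.7000])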
@@ -241,7 +251,7 @@ class PPOPeftTrainer(PPOTrainer, PeftTrainer):
         for j in range(len(query_batch)):
             start = len(query_batch[j]) - 1
-            if attention_mask[j, 0] == 0: # offset left padding
+            if attention_mask[j, 0] == 0: # offset left padding
                 start += attention_mask[j, :].nonzero()[0]
             end = start + len(response_batch[j])
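For context, this loop locates the response span inside each padded (query + response) row: the value at the last query token is the one that predicts the first response token, and left padding shifts that position by the number of leading pads. A standalone sketch with invented token ids:

import torch

# Hypothetical mini-batch, conceptually:
#   row 0: [7, 8, 3, 4]    query [7, 8] + response [3, 4], no padding
#   row 1: [0, 0, 9, 5]    two left pads + query [9] + response [5]
query_batch = [[7, 8], [9]]
response_batch = [[3, 4], [5]]
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [0, 0, 1, 1]])

for j in range(len(query_batch)):
    start = len(query_batch[j]) - 1            # last query token predicts the first response token
    if attention_mask[j, 0] == 0:              # offset left padding
        start += int(attention_mask[j, :].nonzero()[0])
    end = start + len(response_batch[j])
    print(j, start, end)                       # 0 1 3, then 1 2 3: [start, end) spans the response values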
@@ -4,7 +4,7 @@ import math
 from trl import PPOConfig
 from torch.optim import AdamW
 from typing import TYPE_CHECKING, Optional, List
-from transformers import DataCollatorForSeq2Seq
+from transformers import DataCollatorWithPadding
 from transformers.optimization import get_scheduler

 from llmtuner.dsets import get_dataset, preprocess_dataset
@@ -28,7 +28,9 @@ def run_ppo(
     dataset = get_dataset(model_args, data_args)
     model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo")
     dataset = preprocess_dataset(dataset, tokenizer, data_args, training_args, stage="ppo")
-    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=tokenizer.pad_token_id)
+
+    tokenizer.padding_side = "left" # use left-padding in generation while using right-padding in training
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

     ppo_config = PPOConfig(
         model_name=model_args.model_name_or_path,
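Since PPO batches carry no labels, the seq2seq collator's label padding is unnecessary; DataCollatorWithPadding simply pads input_ids and builds attention_mask, honoring the tokenizer's padding_side. A sketch of the new collation path, assuming the public "gpt2" checkpoint purely to get a concrete tokenizer (the real code uses whatever load_model_and_tokenizer returns):

from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token      # gpt2 defines no pad token by default
tokenizer.padding_side = "left"                # left-padding for generation

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
features = [{"input_ids": [5, 6, 7]}, {"input_ids": [8]}]
batch = data_collator(features)
print(batch["input_ids"])      # shorter rows are padded on the left
print(batch["attention_mask"]) # 0 marks the inserted pad tokens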