Merge pull request #1553 from hannlp/hans
Change the default argument settings for PPO training

Former-commit-id: 1b64678fa4979485f67c3bb1420dfdff6fcbc6e7
@@ -74,6 +74,10 @@ class RLHFArguments:
         default=None,
         metadata={"help": "Log with either 'wandb' or 'tensorboard' in PPO training."}
     )
+    ppo_epochs: Optional[int] = field(
+        default=4,
+        metadata={"help": "Number of optimisation epochs per batch of samples"},
+    )
     ppo_score_norm: Optional[bool] = field(
         default=False,
         metadata={"help": "Use score normalization in PPO training."}
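The new ppo_epochs field follows the same field(default=..., metadata={"help": ...}) pattern as the surrounding arguments, so it becomes a user-configurable setting with no extra wiring. Below is a minimal, self-contained sketch of how such a dataclass could be parsed from the command line; it assumes HfArgumentParser from transformers (the usual companion to this argument style) and reproduces only the two fields visible in this hunk, not the full RLHFArguments class.

from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class RLHFArguments:
    # Only the two fields shown in this commit are reproduced here.
    ppo_epochs: Optional[int] = field(
        default=4,
        metadata={"help": "Number of optimisation epochs per batch of samples"},
    )
    ppo_score_norm: Optional[bool] = field(
        default=False,
        metadata={"help": "Use score normalization in PPO training."},
    )


if __name__ == "__main__":
    # Each dataclass field becomes a CLI flag, e.g. `--ppo_epochs 2`.
    parser = HfArgumentParser(RLHFArguments)
    (rlhf_args,) = parser.parse_args_into_dataclasses(args=["--ppo_epochs", "2"])
    print(rlhf_args.ppo_epochs)      # 2 (overrides the new default of 4)
    print(rlhf_args.ppo_score_norm)  # False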
@@ -45,7 +45,7 @@ def run_ppo(
         mini_batch_size=training_args.per_device_train_batch_size,
         batch_size=training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps,
         gradient_accumulation_steps=training_args.gradient_accumulation_steps,
-        ppo_epochs=1,
+        ppo_epochs=finetuning_args.ppo_epochs,
         max_grad_norm=training_args.max_grad_norm,
         seed=training_args.seed,
         optimize_device_cache=True,
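In run_ppo, the previously hard-coded ppo_epochs=1 is replaced by the parsed finetuning_args.ppo_epochs. A rough sketch of the resulting PPOConfig construction follows; build_ppo_config is a hypothetical helper name, training_args and finetuning_args stand in for the parsed argument objects, and any other options the real run_ppo passes outside this hunk are omitted.

from trl import PPOConfig


def build_ppo_config(training_args, finetuning_args):
    # Mirrors the mapping in run_ppo: the per-device batch size becomes the
    # PPO mini-batch, and the effective batch is scaled by accumulation steps.
    return PPOConfig(
        mini_batch_size=training_args.per_device_train_batch_size,
        batch_size=training_args.per_device_train_batch_size
        * training_args.gradient_accumulation_steps,
        gradient_accumulation_steps=training_args.gradient_accumulation_steps,
        ppo_epochs=finetuning_args.ppo_epochs,  # was hard-coded to 1 before this change
        max_grad_norm=training_args.max_grad_norm,
        seed=training_args.seed,
        optimize_device_cache=True,
    )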