fix galore
Former-commit-id: 62a3ceeef8f60caef43ccc7f971a0c9184e21296
@@ -66,10 +66,6 @@ class LoraArguments:
                     Others choices: the same as LLaMA."""
         },
     )
-    lora_bf16_mode: bool = field(
-        default=False,
-        metadata={"help": "Whether or not to train lora adapters in bf16 precision."},
-    )
     use_rslora: bool = field(
         default=False,
         metadata={"help": "Whether or not to use the rank stabilization scaling factor for LoRA layer."},
@@ -194,6 +190,10 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
     Arguments pertaining to which techniques we are going to fine-tuning with.
     """

+    pure_bf16: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."},
+    )
     stage: Literal["pt", "sft", "rm", "ppo", "dpo"] = field(
         default="sft",
         metadata={"help": "Which stage will be performed in training."},
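Note (not part of the diff): the new pure_bf16 switch is an ordinary dataclass field, so it is picked up from the command line the same way as the other arguments above, via the HfArgumentParser imported in the parser module below. A minimal sketch using a stand-in dataclass; FinetuningArgsSketch and sketch_args are illustrative names, not project code:

    from dataclasses import dataclass, field

    from transformers import HfArgumentParser


    @dataclass
    class FinetuningArgsSketch:  # illustrative stand-in for the real FinetuningArguments
        pure_bf16: bool = field(
            default=False,
            metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."},
        )


    parser = HfArgumentParser(FinetuningArgsSketch)
    (sketch_args,) = parser.parse_args_into_dataclasses(args=["--pure_bf16", "True"])
    print(sketch_args.pure_bf16)  # True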
@@ -7,6 +7,7 @@ import torch
 import transformers
 from transformers import HfArgumentParser, Seq2SeqTrainingArguments
 from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import is_torch_bf16_gpu_available

 from ..extras.logging import get_logger
 from ..extras.misc import check_dependencies
@@ -156,6 +157,13 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
         if model_args.use_unsloth:
             raise ValueError("Unsloth does not support DoRA.")

+    if finetuning_args.pure_bf16:
+        if not is_torch_bf16_gpu_available():
+            raise ValueError("This device does not support `pure_bf16`.")
+
+        if training_args.fp16 or training_args.bf16:
+            raise ValueError("Turn off mixed precision training when using `pure_bf16`.")
+
     _verify_model_args(model_args, finetuning_args)

     if (
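Note (not part of the diff): the guard above relies only on public APIs, so it can be reproduced in isolation. A small sketch of the same check; the helper name assert_pure_bf16_ok is made up for illustration:

    from transformers.utils import is_torch_bf16_gpu_available


    def assert_pure_bf16_ok(fp16: bool, bf16: bool) -> None:
        """Mirror the new validation: pure bf16 needs bf16-capable hardware and no AMP flags."""
        if not is_torch_bf16_gpu_available():
            raise ValueError("This device does not support `pure_bf16`.")
        if fp16 or bf16:
            raise ValueError("Turn off mixed precision training when using `pure_bf16`.")


    # On an Ampere-or-newer GPU with --fp16/--bf16 left unset, this passes silently.
    assert_pure_bf16_ok(fp16=False, bf16=False)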
@@ -226,9 +234,11 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
         )

     # Post-process model arguments
-    model_args.compute_dtype = (
-        torch.bfloat16 if training_args.bf16 else (torch.float16 if training_args.fp16 else None)
-    )
+    if training_args.bf16 or finetuning_args.pure_bf16:
+        model_args.compute_dtype = torch.bfloat16
+    elif training_args.fp16:
+        model_args.compute_dtype = torch.float16
+
     model_args.model_max_length = data_args.cutoff_len
     model_args.aqlm_optimization = not training_args.predict_with_generate
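Note (not part of the diff): the rewritten branch leaves compute_dtype unset unless a half-precision mode is requested, and lets pure_bf16 force bfloat16 even though the new parser check requires --bf16 to stay off. The same logic as a small pure function; resolve_compute_dtype is an illustrative name:

    from typing import Optional

    import torch


    def resolve_compute_dtype(bf16: bool, fp16: bool, pure_bf16: bool) -> Optional[torch.dtype]:
        """bf16 or pure_bf16 wins, then fp16, otherwise None (leave the default dtype alone)."""
        if bf16 or pure_bf16:
            return torch.bfloat16
        if fp16:
            return torch.float16
        return None


    assert resolve_compute_dtype(bf16=False, fp16=False, pure_bf16=True) is torch.bfloat16
    assert resolve_compute_dtype(bf16=False, fp16=True, pure_bf16=False) is torch.float16
    assert resolve_compute_dtype(bf16=False, fp16=False, pure_bf16=False) is None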
@@ -34,7 +34,8 @@ def init_adapter(

     if finetuning_args.finetuning_type == "full" and is_trainable:
         logger.info("Fine-tuning method: Full")
-        model = model.float()
+        if not finetuning_args.pure_bf16:
+            model = model.float()

     if finetuning_args.finetuning_type == "freeze" and is_trainable:
         logger.info("Fine-tuning method: Freeze")
@@ -78,7 +79,8 @@ def init_adapter(

         for name, param in model.named_parameters():
             if any(trainable_layer in name for trainable_layer in trainable_layers):
-                param.data = param.data.to(torch.float32)
+                if not finetuning_args.pure_bf16:
+                    param.data = param.data.to(torch.float32)
             else:
                 param.requires_grad_(False)
@@ -150,8 +152,9 @@ def init_adapter(
             )
             model = get_peft_model(model, lora_config)

-        for param in filter(lambda p: p.requires_grad, model.parameters()):
-            param.data = param.data.to(torch.bfloat16 if finetuning_args.lora_bf16_mode else torch.float32)
+        if not finetuning_args.pure_bf16:
+            for param in filter(lambda p: p.requires_grad, model.parameters()):
+                param.data = param.data.to(torch.float32)

     if model_args.adapter_name_or_path is not None:
         logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path)))
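Note (not part of the diff): across the full, freeze, and LoRA branches the pattern is the same: upcast trainable parameters to fp32 for AMP training, and skip the upcast when running purely in bf16. A self-contained sketch of that pattern on a toy module; cast_trainable_params is an illustrative helper, not project code:

    import torch
    import torch.nn as nn


    def cast_trainable_params(model: nn.Module, pure_bf16: bool) -> None:
        """Upcast trainable params to fp32 unless the run is purely bf16 (no AMP master copy)."""
        if pure_bf16:
            return  # leave parameters in their loaded (bf16) dtype
        for param in filter(lambda p: p.requires_grad, model.parameters()):
            param.data = param.data.to(torch.float32)


    toy = nn.Linear(4, 4).to(torch.bfloat16)
    cast_trainable_params(toy, pure_bf16=True)
    print(toy.weight.dtype)  # torch.bfloat16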
@@ -154,14 +154,28 @@ def create_custom_optimzer(
         },
     ]
     if training_args.optim == "adamw_torch":
-        optimizer = GaLoreAdamW(param_groups, lr=training_args.learning_rate)
-    elif training_args.optim == "adamw_8bit":
-        optimizer = GaLoreAdamW8bit(param_groups, lr=training_args.learning_rate)
+        optimizer = GaLoreAdamW(
+            param_groups,
+            lr=training_args.learning_rate,
+            eps=training_args.adam_epsilon,
+            betas=(training_args.adam_beta1, training_args.adam_beta2),
+        )
+    elif training_args.optim in ["adamw_bnb_8bit", "adamw_8bit", "paged_adamw_8bit"]:
+        optimizer = GaLoreAdamW8bit(
+            param_groups,
+            lr=training_args.learning_rate,
+            eps=training_args.adam_epsilon,
+            betas=(training_args.adam_beta1, training_args.adam_beta2),
+            optim_bits=8,
+            is_paged="paged" in training_args.optim,
+        )
     elif training_args.optim == "adafactor":
-        optimizer = GaLoreAdafactor(param_groups, lr=training_args.learning_rate)
+        optimizer = GaLoreAdafactor(
+            param_groups,
+            lr=training_args.learning_rate,
+        )
     else:
         raise NotImplementedError("Unknow optim: {}".format(training_args.optim))

-    logger.info("Used the GaLore optimizer, may cause hanging at the start of training, wait patiently.")
-
+    logger.info("Using GaLore optimizer, may cause hanging at the start of training, wait patiently.")
     return optimizer
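Note (not part of the diff): the fixed dispatch now forwards the trainer's Adam hyperparameters (adam_epsilon, adam_beta1, adam_beta2) to the GaLore optimizers and recognizes the bitsandbytes-style 8-bit optimizer names. A stripped-down sketch, assuming the galore_torch package is installed; build_galore_optimizer and the toy parameter list are illustrative only:

    import torch
    from galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit


    def build_galore_optimizer(optim: str, param_groups, lr: float,
                               eps: float = 1e-8, betas=(0.9, 0.999)):
        """Dispatch mirroring the updated create_custom_optimzer branch."""
        if optim == "adamw_torch":
            return GaLoreAdamW(param_groups, lr=lr, eps=eps, betas=betas)
        if optim in ["adamw_bnb_8bit", "adamw_8bit", "paged_adamw_8bit"]:
            return GaLoreAdamW8bit(param_groups, lr=lr, eps=eps, betas=betas,
                                   optim_bits=8, is_paged="paged" in optim)
        if optim == "adafactor":
            return GaLoreAdafactor(param_groups, lr=lr)
        raise NotImplementedError("Unknown optim: {}".format(optim))


    # Example: plain GaLore AdamW over a toy parameter list.
    toy_params = [torch.nn.Parameter(torch.zeros(2, 2))]
    optimizer = build_galore_optimizer("adamw_torch", toy_params, lr=1e-4)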