support BLOOM models

Former-commit-id: 1314b6ea39a01aa8ac325e1d875ac013d43aec45
hiyouga
2023-05-31 16:54:06 +08:00
parent 181c776b58
commit 693c049eac
16 changed files with 134 additions and 90 deletions


@@ -75,7 +75,7 @@ def prepare_model_for_training(
     model: PreTrainedModel,
     output_embedding_layer_name: Optional[str] = "lm_head",
     use_gradient_checkpointing: Optional[bool] = True,
-    layer_norm_names: Optional[List[str]] = ["norm"] # for LLaMA setting
+    layer_norm_names: Optional[List[str]] = ["norm", "ln_f"] # for LLaMA and BLOOM setting
 ) -> PreTrainedModel:
     for name, param in model.named_parameters():
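For context, layer_norm_names is a list of substrings matched against parameter names so that layer-norm weights can be kept in float32 during mixed-precision training; "ln_f" covers BLOOM's final layer norm, while "norm" matches both LLaMA's RMSNorm modules and BLOOM's per-block layer norms. Below is a minimal sketch of how such a helper is commonly written (modelled on PEFT's prepare_model_for_int8_training pattern, not the exact body from this commit; the function name and details are illustrative):

from typing import List, Optional

import torch
from transformers import PreTrainedModel


def prepare_model_for_training_sketch(
    model: PreTrainedModel,
    output_embedding_layer_name: Optional[str] = "lm_head",
    use_gradient_checkpointing: Optional[bool] = True,
    layer_norm_names: Optional[List[str]] = ["norm", "ln_f"],  # LLaMA and BLOOM
) -> PreTrainedModel:
    # Keep layer-norm weights in fp32 for numerical stability under fp16 training.
    for name, param in model.named_parameters():
        if param.ndim == 1 and any(ln in name for ln in layer_norm_names):
            param.data = param.data.to(torch.float32)

    if use_gradient_checkpointing:
        model.enable_input_require_grads()   # make checkpointed inputs require grads
        model.gradient_checkpointing_enable()
        model.config.use_cache = False       # KV cache is incompatible with checkpointing

    # Cast the LM head output back to fp32 so the loss is computed in full precision.
    if hasattr(model, output_embedding_layer_name):
        head = getattr(model, output_embedding_layer_name)

        class CastOutputToFloat(torch.nn.Sequential):
            def forward(self, x: torch.Tensor) -> torch.Tensor:
                return super().forward(x).to(torch.float32)

        setattr(model, output_embedding_layer_name, CastOutputToFloat(head))

    return model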
@@ -143,29 +143,6 @@ def load_valuehead_params(model: torch.nn.Module, checkpoint_dir: os.PathLike) -
     model.register_buffer("default_head_bias", torch.zeros_like(valuehead_state_dict["summary.bias"]))
-def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
-    r"""
-    Configures device map for LLaMA.
-    Borrowed from: https://github.com/THUDM/ChatGLM-6B/blob/dev_multi_gpu/utils.py#L8
-    """
-    num_layers = 28
-    layers_per_gpu = 30 / num_gpus
-    device_map = {"model.embed_tokens": 0, "model.norm": 0, "lm_head": 0}
-    added_layers = 2
-    target_gpu = 0
-    for i in range(num_layers):
-        if added_layers >= layers_per_gpu:
-            target_gpu += 1
-            added_layers = 0
-        assert target_gpu < num_gpus
-        device_map[f"model.layers.{i}"] = target_gpu
-        added_layers += 1
-    return device_map
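The helper removed above hard-codes LLaMA module names (model.embed_tokens, model.layers.{i}, model.norm), which do not exist in BLOOM's transformer.h.{i} / transformer.ln_f layout. A hedged sketch of the usual architecture-agnostic alternative, letting Accelerate compute the placement instead (the checkpoint name and arguments are illustrative, not taken from this commit; requires the accelerate package):

from transformers import AutoModelForCausalLM

# device_map="auto" asks Accelerate to shard layers across the available GPUs,
# regardless of whether the checkpoint is LLaMA- or BLOOM-shaped.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloomz-560m",   # hypothetical example checkpoint
    device_map="auto",
    torch_dtype="auto",
)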
 def smooth(scalars: List[float], weight: Optional[float] = 0.95) -> List[float]:
     """
     EMA implementation according to TensorBoard.
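The diff is truncated at this point. For reference, TensorBoard-style EMA smoothing of a scalar series follows the pattern below (a sketch under that assumption, not the commit's exact body):

from typing import List, Optional

def smooth_sketch(scalars: List[float], weight: Optional[float] = 0.95) -> List[float]:
    # Exponential moving average: each output blends the previous smoothed
    # value with the current raw value, weighted by `weight`.
    last = scalars[0]
    smoothed = []
    for value in scalars:
        last = last * weight + (1.0 - weight) * value
        smoothed.append(last)
    return smoothed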