Former-commit-id: c1a78a3a9f8ab9d57577cee37f9c457d60863ba2
This commit is contained in:
hiyouga
2024-06-27 20:14:48 +08:00
parent 9caf9b6f91
commit bf99223a80
7 changed files with 28 additions and 26 deletions

View File

@@ -91,7 +91,7 @@ def main():
master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port))
- subproc = subprocess.run(
+ process = subprocess.run(
(
"torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} "
"--master_addr {master_addr} --master_port {master_port} {file_name} {args}"
@@ -106,7 +106,7 @@ def main():
),
shell=True,
)
- sys.exit(subproc.returncode)
+ sys.exit(process.returncode)
else:
run_exp()
elif command == Command.WEBDEMO:

View File

@@ -199,8 +199,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
if not is_torch_bf16_gpu_available():
raise ValueError("This device does not support `pure_bf16`.")
- if training_args.deepspeed:
- raise ValueError("`pure_bf16` is incompatible with DeepSpeed.")
+ if is_deepspeed_zero3_enabled():
+ raise ValueError("`pure_bf16` is incompatible with DeepSpeed ZeRO-3.")
if training_args.fp16 or training_args.bf16:
raise ValueError("Turn off mixed precision training when using `pure_bf16`.")