[misc] upgrade format to py39 (#7256)
@@ -14,7 +14,7 @@

 import json
 import os
-from typing import Sequence
+from collections.abc import Sequence

 from openai import OpenAI
 from transformers.utils.versions import require_version

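Note on the first hunk: since Python 3.9 (PEP 585), `typing.Sequence` is just a deprecated alias of `collections.abc.Sequence`, so the abc class can be imported directly and used both as a generic annotation and in runtime `isinstance` checks. A minimal sketch of the pattern (the `count_turns` helper is illustrative, not part of this PR):

    from collections.abc import Sequence  # preferred over typing.Sequence on Python 3.9+


    def count_turns(messages: Sequence[dict]) -> int:
        """Count chat turns; accepts any sequence type (list, tuple, ...)."""
        assert isinstance(messages, Sequence)  # abc classes also work at runtime
        return len(messages)


    print(count_turns([{"role": "user", "content": "hi"}]))  # prints 1
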
@@ -15,7 +15,7 @@
 import json
 import os
 from collections import OrderedDict
-from typing import Any, Dict
+from typing import Any

 import fire
 import torch
@@ -29,13 +29,13 @@ CONFIG_NAME = "config.json"


 def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool):
-    baichuan2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    baichuan2_state_dict: dict[str, torch.Tensor] = OrderedDict()
     for filepath in tqdm(os.listdir(input_dir), desc="Load weights"):
         if os.path.isfile(os.path.join(input_dir, filepath)) and filepath.endswith(".bin"):
             shard_weight = torch.load(os.path.join(input_dir, filepath), map_location="cpu")
             baichuan2_state_dict.update(shard_weight)

-    llama_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    llama_state_dict: dict[str, torch.Tensor] = OrderedDict()
     for key, value in tqdm(baichuan2_state_dict.items(), desc="Convert format"):
         if "W_pack" in key:
             proj_size = value.size(0) // 3
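The `Dict[...]` to `dict[...]` rewrites follow the same PEP 585 rule: on Python 3.9+ the built-in containers are subscriptable, so `typing.Dict`, `typing.List`, and friends are no longer needed in annotations. A small hedged illustration (names invented for the example):

    import torch

    # Python 3.9+ built-in generics; no `from typing import Dict` required.
    state_dict: dict[str, torch.Tensor] = {"weight": torch.zeros(2, 2)}


    def tensor_names(sd: dict[str, torch.Tensor]) -> list[str]:
        """Return the tensor names stored in a state dict."""
        return list(sd.keys())


    print(tensor_names(state_dict))  # ['weight']
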
@@ -75,7 +75,7 @@ def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetenso

 def save_config(input_dir: str, output_dir: str):
     with open(os.path.join(input_dir, CONFIG_NAME), encoding="utf-8") as f:
-        llama2_config_dict: Dict[str, Any] = json.load(f)
+        llama2_config_dict: dict[str, Any] = json.load(f)

     llama2_config_dict["architectures"] = ["LlamaForCausalLM"]
     llama2_config_dict.pop("auto_map", None)
@@ -94,8 +94,8 @@ def llamafy_baichuan2(
     shard_size: str = "2GB",
     save_safetensors: bool = True,
 ):
-    r"""
-    Converts the Baichuan2-7B model in the same format as LLaMA2-7B.
+    r"""Convert the Baichuan2-7B model in the same format as LLaMA2-7B.
+
     Usage: python llamafy_baichuan2.py --input_dir input --output_dir output
     Converted model: https://huggingface.co/hiyouga/Baichuan2-7B-Base-LLaMAfied
     """
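The docstring edits in this and the following hunks all apply one convention: the summary sits on the same line as `r"""`, uses the imperative mood ("Convert" rather than "Converts"), ends with a period, and is separated from the usage details by a blank line. This is the PEP 257 layout that pydocstyle-style D2xx checks expect. A sketch of the resulting shape (the function name is illustrative):

    def llamafy_model(input_dir: str, output_dir: str):
        r"""Convert a checkpoint into the LLaMA layout.

        Usage: python llamafy_model.py --input_dir input --output_dir output
        """
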
@@ -15,7 +15,7 @@
 import json
 import os
 from collections import OrderedDict
-from typing import Any, Dict
+from typing import Any

 import fire
 import torch
@@ -37,14 +37,14 @@ CONFIG_NAME = "config.json"


 def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool) -> str:
-    qwen_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    qwen_state_dict: dict[str, torch.Tensor] = OrderedDict()
     for filepath in tqdm(os.listdir(input_dir), desc="Load weights"):
         if os.path.isfile(os.path.join(input_dir, filepath)) and filepath.endswith(".safetensors"):
             with safe_open(os.path.join(input_dir, filepath), framework="pt", device="cpu") as f:
                 for key in f.keys():
                     qwen_state_dict[key] = f.get_tensor(key)

-    llama_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    llama_state_dict: dict[str, torch.Tensor] = OrderedDict()
     torch_dtype = None
     for key, value in tqdm(qwen_state_dict.items(), desc="Convert format"):
         if torch_dtype is None:
@@ -112,9 +112,9 @@ def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetenso

 def save_config(input_dir: str, output_dir: str, torch_dtype: str):
     with open(os.path.join(input_dir, CONFIG_NAME), encoding="utf-8") as f:
-        qwen_config_dict: Dict[str, Any] = json.load(f)
+        qwen_config_dict: dict[str, Any] = json.load(f)

-    llama2_config_dict: Dict[str, Any] = OrderedDict()
+    llama2_config_dict: dict[str, Any] = OrderedDict()
     llama2_config_dict["architectures"] = ["LlamaForCausalLM"]
     llama2_config_dict["hidden_act"] = "silu"
     llama2_config_dict["hidden_size"] = qwen_config_dict["hidden_size"]
@@ -147,8 +147,8 @@ def llamafy_qwen(
     shard_size: str = "2GB",
     save_safetensors: bool = False,
 ):
-    r"""
-    Converts the Qwen models in the same format as LLaMA2.
+    r"""Convert the Qwen models in the same format as LLaMA2.
+
     Usage: python llamafy_qwen.py --input_dir input --output_dir output
     Converted model: https://huggingface.co/hiyouga/Qwen-14B-Chat-LLaMAfied
     """
@@ -18,7 +18,7 @@
 import json
 import os
 from collections import OrderedDict
-from typing import TYPE_CHECKING, Dict
+from typing import TYPE_CHECKING

 import fire
 import torch
@@ -44,11 +44,11 @@ def block_expansion(
     shard_size: str = "5GB",
     save_safetensors: bool = True,
 ):
-    r"""
-    Performs block expansion for LLaMA, Mistral, Qwen2 or Yi models.
+    r"""Perform block expansion for LLaMA, Mistral, Qwen2 or Yi models.
+
     Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8
     """
-    config: "PretrainedConfig" = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
+    config: PretrainedConfig = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
     num_layers = getattr(config, "num_hidden_layers")
     if num_layers % num_expand != 0:
         raise ValueError(f"`num_layers` {num_layers} should be divisible by `num_expand` {num_expand}.")
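Dropping the quotes from annotations such as `config: "PretrainedConfig"` is safe even though the class is only imported under `TYPE_CHECKING`: per PEP 526, annotations on local variables are never evaluated at runtime, so the bare name cannot raise `NameError` inside a function body. A runnable sketch under that assumption (the stand-in object is hypothetical):

    from types import SimpleNamespace
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:  # imported only for the type checker, absent at runtime
        from transformers import PretrainedConfig


    def show_num_layers(num_hidden_layers: int = 32) -> int:
        # PEP 526: local-variable annotations are not evaluated, so the unquoted
        # name below cannot fail even though it is undefined in this module at runtime.
        config: PretrainedConfig = SimpleNamespace(num_hidden_layers=num_hidden_layers)
        return config.num_hidden_layers


    print(show_num_layers())  # prints 32
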
@@ -70,7 +70,7 @@ def block_expansion(
     split = num_layers // num_expand
     layer_cnt = 0
     state_dict = model.state_dict()
-    output_state_dict: Dict[str, "torch.Tensor"] = OrderedDict()
+    output_state_dict: dict[str, torch.Tensor] = OrderedDict()
     for i in range(num_layers):
         for key, value in state_dict.items():
             if f".{i:d}." in key:
@@ -38,8 +38,8 @@ def quantize_loftq(
     lora_target: tuple = ("q_proj", "v_proj"),
     save_safetensors: bool = True,
 ):
-    r"""
-    Initializes LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ)
+    r"""Initialize LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ).
+
     Usage: python loftq_init.py --model_name_or_path path_to_model --output_dir output_dir
     """
     if isinstance(lora_target, str):
@@ -72,7 +72,7 @@ def quantize_loftq(
     print(f"Adapter weights saved in {loftq_dir}")

     # Save base model
-    base_model: "PreTrainedModel" = peft_model.unload()
+    base_model: PreTrainedModel = peft_model.unload()
     base_model.save_pretrained(output_dir, safe_serialization=save_safetensors)
     tokenizer.save_pretrained(output_dir)
     print(f"Model weights saved in {output_dir}")
@@ -37,8 +37,8 @@ def quantize_pissa(
     lora_target: tuple = ("q_proj", "v_proj"),
     save_safetensors: bool = True,
 ):
-    r"""
-    Initializes LoRA weights with Principal Singular values and Singular vectors Adaptation (PiSSA)
+    r"""Initialize LoRA weights with Principal Singular values and Singular vectors Adaptation (PiSSA).
+
     Usage: python pissa_init.py --model_name_or_path path_to_model --output_dir output_dir
     """
     if isinstance(lora_target, str):
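For readers unfamiliar with what pissa_init.py wraps: PiSSA initializes the LoRA A/B matrices from the principal singular directions of the base weights and leaves a residual base model behind. A rough sketch with peft (a recent peft release is assumed; the model path and hyper-parameters are placeholders, not the script's defaults):

    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("path_to_model")
    lora_config = LoraConfig(
        task_type="CAUSAL_LM",
        r=16,
        target_modules=["q_proj", "v_proj"],
        init_lora_weights="pissa",  # principal singular value/vector initialization
    )
    peft_model = get_peft_model(model, lora_config)
    peft_model.save_pretrained("output_dir/pissa_init")  # adapter weights only
    base_model = peft_model.unload()  # residual base model, as in the hunk above
    base_model.save_pretrained("output_dir")
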
@@ -67,7 +67,7 @@ def quantize_pissa(
     print(f"Adapter weights saved in {pissa_dir}")

     # Save base model
-    base_model: "PreTrainedModel" = peft_model.unload()
+    base_model: PreTrainedModel = peft_model.unload()
     base_model.save_pretrained(output_dir, safe_serialization=save_safetensors)
     tokenizer.save_pretrained(output_dir)
     print(f"Model weights saved in {output_dir}")
@@ -29,8 +29,8 @@ def calculate_flops(
     seq_length: int = 512,
     flash_attn: str = "auto",
 ):
-    r"""
-    Calculates the flops of pre-trained models.
+    r"""Calculate the flops of pre-trained models.
+
     Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
     """
     with get_accelerator().device(0):
@@ -45,8 +45,8 @@ def calculate_lr(
     is_mistral_or_gemma: bool = False,  # mistral and gemma models opt for a smaller learning rate,
     packing: bool = False,
 ):
-    r"""
-    Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters.
+    r"""Calculate the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters.
+
     Usage:
     python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en_demo --cutoff_len 1024 --batch_size 16
     """
@@ -89,9 +89,8 @@ def calculate_lr(
     lr = BASE_LR * math.sqrt(token_batch_size / BASE_BS)  # lr ~ sqrt(batch_size)
     lr = lr / 6.0 if is_mistral_or_gemma else lr
     print(
-        "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective token batch size {:.2f}".format(
-            lr, valid_ratio * 100, token_batch_size
-        )
+        f"Optimal learning rate is {lr:.2e} for valid ratio% {valid_ratio * 100:.2f} "
+        f"and effective token batch size {token_batch_size:.2f}"
     )


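The cal_lr.py hunk above also converts a `.format(...)` call into two adjacent f-strings. Python concatenates adjacent string literals at compile time, so the message can be split across lines for the line-length limit without changing the output; a quick check of the equivalence (the numbers are made up):

    lr, valid_ratio, token_batch_size = 3.2e-4, 0.95, 8192.0

    old_style = "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective token batch size {:.2f}".format(
        lr, valid_ratio * 100, token_batch_size
    )
    new_style = (
        f"Optimal learning rate is {lr:.2e} for valid ratio% {valid_ratio * 100:.2f} "
        f"and effective token batch size {token_batch_size:.2f}"
    )
    assert old_style == new_style  # adjacent literals concatenate; output is identical
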
@@ -34,9 +34,7 @@ def compute_model_flops(
     include_recompute: bool = False,
     include_flashattn: bool = False,
 ) -> int:
-    r"""
-    Calculates the FLOPs of model per forward/backward pass.
-    """
+    r"""Calculate the FLOPs of model per forward/backward pass."""
     config = AutoConfig.from_pretrained(model_name_or_path)
     hidden_size = getattr(config, "hidden_size", None)
     vocab_size = getattr(config, "vocab_size", None)
@@ -86,9 +84,7 @@ def compute_model_flops(


 def compute_device_flops(world_size: int) -> float:
-    r"""
-    Calculates the FLOPs of the device capability per second.
-    """
+    r"""Calculate the FLOPs of the device capability per second."""
     device_name = torch.cuda.get_device_name()
     if "H100" in device_name or "H800" in device_name:
         return 989 * 1e12 * world_size
@@ -114,8 +110,8 @@ def calculate_mfu(
     liger_kernel: bool = False,
     unsloth_gc: bool = False,
 ) -> float:
-    r"""
-    Calculates MFU for given model and hyper-params.
+    r"""Calculate MFU for given model and hyper-params.
+
     Usage: python cal_mfu.py --model_name_or_path path_to_model --batch_size 1 --seq_length 1024
     """
     args = {
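cal_mfu.py splits the measurement into per-step model FLOPs (compute_model_flops) and per-second device capability (compute_device_flops). MFU is conventionally the achieved FLOP/s divided by the peak FLOP/s; a hedged sketch of how such helpers combine, with illustrative numbers rather than the script's exact code:

    def estimate_mfu(flops_per_step: float, peak_flops_per_sec: float, seconds_per_step: float) -> float:
        """Model FLOPs Utilization: achieved FLOP/s divided by peak FLOP/s."""
        achieved_flops_per_sec = flops_per_step / seconds_per_step
        return achieved_flops_per_sec / peak_flops_per_sec


    # e.g. 1.5e15 FLOPs per step, one H800 (989 TFLOP/s), 3.0 s per step -> ~50.6%
    print(f"{estimate_mfu(1.5e15, 989e12, 3.0):.2%}")
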
@@ -13,8 +13,9 @@
 # limitations under the License.

 import json
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Any, Dict, Literal, Optional, Sequence
+from typing import Any, Literal, Optional

 import fire
 import torch
@@ -30,16 +31,12 @@ from llamafactory.model import load_model, load_tokenizer

 @dataclass
 class PairwiseDataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
-    r"""
-    Data collator for pairwise data.
-    """
+    r"""Data collator for pairwise data."""

     train_on_prompt: bool = False

-    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
-        r"""
-        Pads batched data to the longest sequence in the batch.
-        """
+    def __call__(self, features: Sequence[dict[str, Any]]) -> dict[str, torch.Tensor]:
+        r"""Pad batched data to the longest sequence in the batch."""
         chosen_features = []
         for feature in features:
             chosen_features.append(
@@ -68,8 +65,8 @@ def calculate_ppl(
     max_samples: Optional[int] = None,
     train_on_prompt: bool = False,
 ):
-    r"""
-    Calculates the ppl on the dataset of the pre-trained models.
+    r"""Calculate the ppl on the dataset of the pre-trained models.
+
     Usage: export CUDA_VISIBLE_DEVICES=0
     python cal_ppl.py --model_name_or_path path_to_model --dataset alpaca_en_demo --save_name ppl.json
     """
@@ -111,17 +108,17 @@ def calculate_ppl(
     criterion = torch.nn.CrossEntropyLoss(reduction="none")
     total_ppl = 0
     perplexities = []
-    batch: Dict[str, "torch.Tensor"]
+    batch: dict[str, torch.Tensor]
     with torch.no_grad():
         for batch in tqdm(dataloader, desc="Computing perplexities"):
             batch = batch.to(model.device)
             outputs = model(**batch)
-            shift_logits: "torch.Tensor" = outputs["logits"][..., :-1, :]
-            shift_labels: "torch.Tensor" = batch["labels"][..., 1:]
+            shift_logits: torch.Tensor = outputs["logits"][..., :-1, :]
+            shift_labels: torch.Tensor = batch["labels"][..., 1:]
             loss_mask = shift_labels != IGNORE_INDEX
             flatten_logits = shift_logits.contiguous().view(shift_labels.size(0) * shift_labels.size(1), -1)
             flatten_labels = shift_labels.contiguous().view(-1)
-            token_logps: "torch.Tensor" = criterion(flatten_logits, flatten_labels)
+            token_logps: torch.Tensor = criterion(flatten_logits, flatten_labels)
             token_logps = token_logps.contiguous().view(shift_logits.size(0), -1)
             sentence_logps = (token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
             total_ppl += sentence_logps.exp().sum().item()
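The cal_ppl.py hunk only touches annotations, but the masking logic around it is worth restating: logits are shifted against labels, positions equal to IGNORE_INDEX are masked out, and the mean token negative log-likelihood per sequence is exponentiated. A self-contained sketch of that pattern (shapes and the -100 ignore index mirror the script; the random tensors are stand-ins for model outputs):

    import torch

    IGNORE_INDEX = -100


    def sentence_ppl(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Per-sequence perplexity from (batch, seq, vocab) logits and (batch, seq) labels."""
        criterion = torch.nn.CrossEntropyLoss(reduction="none")
        shift_logits = logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        loss_mask = shift_labels != IGNORE_INDEX
        flat_logits = shift_logits.contiguous().view(-1, shift_logits.size(-1))
        flat_labels = shift_labels.contiguous().view(-1)
        token_nll = criterion(flat_logits, flat_labels)  # ignore_index=-100 yields 0 loss there
        token_nll = token_nll.view(shift_labels.size(0), -1)
        sentence_nll = (token_nll * loss_mask).sum(-1) / loss_mask.sum(-1)
        return sentence_nll.exp()


    logits = torch.randn(2, 8, 32)
    labels = torch.randint(0, 32, (2, 8))
    labels[:, :2] = IGNORE_INDEX  # pretend the prompt tokens are masked out
    print(sentence_ppl(logits, labels))
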
@@ -29,8 +29,8 @@ def length_cdf(
     template: str = "default",
     interval: int = 1000,
 ):
-    r"""
-    Calculates the distribution of the input lengths in the dataset.
+    r"""Calculate the distribution of the input lengths in the dataset.
+
     Usage: export CUDA_VISIBLE_DEVICES=0
     python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default
     """
@@ -52,8 +52,8 @@ def vllm_infer(
     image_max_pixels: int = 768 * 768,
     image_min_pixels: int = 32 * 32,
 ):
-    r"""
-    Performs batch generation using vLLM engine, which supports tensor parallelism.
+    r"""Perform batch generation using vLLM engine, which supports tensor parallelism.
+
     Usage: python vllm_infer.py --model_name_or_path meta-llama/Llama-2-7b-hf --template llama --dataset alpaca_en_demo
     """
     check_version("vllm>=0.4.3,<=0.7.3")
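vllm_infer.py pins `vllm>=0.4.3,<=0.7.3` and wraps vLLM's offline batch-generation API. A minimal sketch of that pattern (model name and sampling values are placeholders, not the script's defaults):

    from vllm import LLM, SamplingParams

    llm = LLM(model="meta-llama/Llama-2-7b-hf", tensor_parallel_size=1)
    sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=256)
    outputs = llm.generate(["What is the capital of France?"], sampling_params)
    print(outputs[0].outputs[0].text)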