use pre-commit

Former-commit-id: 7cfede95df22a9ff236788f04159b6b16b8d04bb
2024-10-29 09:07:46 +00:00
parent 8f5921692e
commit 248d5daaff
66 changed files with 1028 additions and 1044 deletions
--- a/src/llamafactory/data/aligner.py
+++ b/src/llamafactory/data/aligner.py
@@ -161,7 +161,7 @@ def convert_sharegpt(
    broken_data = False
    for turn_idx, message in enumerate(messages):
        if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]:
-            logger.warning("Invalid role tag in {}.".format(messages))
+            logger.warning(f"Invalid role tag in {messages}.")
            broken_data = True

        aligned_messages.append(
@@ -171,7 +171,7 @@ def convert_sharegpt(
    if (not dataset_attr.ranking and len(aligned_messages) % 2 != 0) or (
        dataset_attr.ranking and len(aligned_messages) % 2 == 0
    ):
-        logger.warning("Invalid message count in {}.".format(messages))
+        logger.warning(f"Invalid message count in {messages}.")
        broken_data = True

    if dataset_attr.kto_tag and isinstance(example[dataset_attr.kto_tag], bool):  # kto example
@@ -192,7 +192,7 @@ def convert_sharegpt(
            chosen[dataset_attr.role_tag] not in accept_tags[-1]
            or rejected[dataset_attr.role_tag] not in accept_tags[-1]
        ):
-            logger.warning("Invalid role tag in {}.".format([chosen, rejected]))
+            logger.warning(f"Invalid role tag in {[chosen, rejected]}.")
            broken_data = True

        prompt = aligned_messages
--- a/src/llamafactory/data/collator.py
+++ b/src/llamafactory/data/collator.py
@@ -137,9 +137,9 @@ class PairwiseDataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
        for key in ("chosen", "rejected"):
            for feature in features:
                target_feature = {
-                    "input_ids": feature["{}_input_ids".format(key)],
-                    "attention_mask": feature["{}_attention_mask".format(key)],
-                    "labels": feature["{}_labels".format(key)],
+                    "input_ids": feature[f"{key}_input_ids"],
+                    "attention_mask": feature[f"{key}_attention_mask"],
+                    "labels": feature[f"{key}_labels"],
                    "images": feature["images"],
                    "videos": feature["videos"],
                }
--- a/src/llamafactory/data/data_utils.py
+++ b/src/llamafactory/data/data_utils.py
@@ -70,7 +70,7 @@ def merge_dataset(
            stopping_strategy="first_exhausted" if data_args.mix_strategy.endswith("under") else "all_exhausted",
        )
    else:
-        raise ValueError("Unknown mixing strategy: {}.".format(data_args.mix_strategy))
+        raise ValueError(f"Unknown mixing strategy: {data_args.mix_strategy}.")


 def split_dataset(
--- a/src/llamafactory/data/formatter.py
+++ b/src/llamafactory/data/formatter.py
@@ -83,14 +83,14 @@ class StringFormatter(Formatter):
            if isinstance(slot, str):
                for name, value in kwargs.items():
                    if not isinstance(value, str):
-                        raise RuntimeError("Expected a string, got {}".format(value))
+                        raise RuntimeError(f"Expected a string, got {value}")

                    slot = slot.replace("{{" + name + "}}", value, 1)
                elements.append(slot)
            elif isinstance(slot, (dict, set)):
                elements.append(slot)
            else:
-                raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot)))
+                raise RuntimeError(f"Input must be string, set[str] or dict[str, str], got {type(slot)}")

        return elements

@@ -113,7 +113,7 @@ class FunctionFormatter(Formatter):
                functions.append((tool_call["name"], json.dumps(tool_call["arguments"], ensure_ascii=False)))

        except json.JSONDecodeError:
-            raise RuntimeError("Invalid JSON format in function message: {}".format(str([content])))  # flat string
+            raise RuntimeError(f"Invalid JSON format in function message: {str([content])}")  # flat string

        elements = []
        for name, arguments in functions:
@@ -124,7 +124,7 @@ class FunctionFormatter(Formatter):
                elif isinstance(slot, (dict, set)):
                    elements.append(slot)
                else:
-                    raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot)))
+                    raise RuntimeError(f"Input must be string, set[str] or dict[str, str], got {type(slot)}")

        return elements

@@ -141,7 +141,7 @@ class ToolFormatter(Formatter):
            tools = json.loads(content)
            return [self.tool_utils.tool_formatter(tools) if len(tools) != 0 else ""]
        except json.JSONDecodeError:
-            raise RuntimeError("Invalid JSON format in tool description: {}".format(str([content])))  # flat string
+            raise RuntimeError(f"Invalid JSON format in tool description: {str([content])}")  # flat string

    @override
    def extract(self, content: str) -> Union[str, List["FunctionCall"]]:
--- a/src/llamafactory/data/loader.py
+++ b/src/llamafactory/data/loader.py
@@ -51,7 +51,7 @@ def _load_single_dataset(
    r"""
    Loads a single dataset and aligns it to the standard format.
    """
-    logger.info("Loading dataset {}...".format(dataset_attr))
+    logger.info(f"Loading dataset {dataset_attr}...")
    data_path, data_name, data_dir, data_files = None, None, None, None
    if dataset_attr.load_from in ["hf_hub", "ms_hub", "om_hub"]:
        data_path = dataset_attr.dataset_name
@@ -77,12 +77,12 @@ def _load_single_dataset(
            data_files.append(local_path)
            data_path = FILEEXT2TYPE.get(local_path.split(".")[-1], None)
        else:
-            raise ValueError("File {} not found.".format(local_path))
+            raise ValueError(f"File {local_path} not found.")

        if data_path is None:
            raise ValueError("Allowed file types: {}.".format(",".join(FILEEXT2TYPE.keys())))
    else:
-        raise NotImplementedError("Unknown load type: {}.".format(dataset_attr.load_from))
+        raise NotImplementedError(f"Unknown load type: {dataset_attr.load_from}.")

    if dataset_attr.load_from == "ms_hub":
        require_version("modelscope>=1.11.0", "To fix: pip install modelscope>=1.11.0")
@@ -145,7 +145,7 @@ def _load_single_dataset(

        assert len(indexes) == dataset_attr.num_samples, "Sample num mismatched."
        dataset = dataset.select(indexes)
-        logger.info("Sampled {} examples from dataset {}.".format(dataset_attr.num_samples, dataset_attr))
+        logger.info(f"Sampled {dataset_attr.num_samples} examples from dataset {dataset_attr}.")

    if data_args.max_samples is not None:  # truncate dataset
        max_samples = min(data_args.max_samples, len(dataset))
@@ -243,7 +243,7 @@ def get_dataset(
        if has_tokenized_data(data_args.tokenized_path):
            logger.warning("Loading dataset from disk will ignore other data arguments.")
            dataset_dict: "DatasetDict" = load_from_disk(data_args.tokenized_path)
-            logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path))
+            logger.info(f"Loaded tokenized dataset from {data_args.tokenized_path}.")

            dataset_module: Dict[str, "Dataset"] = {}
            if "train" in dataset_dict:
@@ -294,8 +294,8 @@ def get_dataset(
        if data_args.tokenized_path is not None:
            if training_args.should_save:
                dataset_dict.save_to_disk(data_args.tokenized_path)
-                logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path))
-                logger.info("Please restart the training with `tokenized_path: {}`.".format(data_args.tokenized_path))
+                logger.info(f"Tokenized dataset saved at {data_args.tokenized_path}.")
+                logger.info(f"Please restart the training with `tokenized_path: {data_args.tokenized_path}`.")

            sys.exit(0)

--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -111,7 +111,7 @@ class BasePlugin:
                    image = Image.open(image["path"])

            if not isinstance(image, ImageObject):
-                raise ValueError("Expect input is a list of Images, but got {}.".format(type(image)))
+                raise ValueError(f"Expect input is a list of Images, but got {type(image)}.")

            results.append(self._preprocess_image(image, **kwargs))

@@ -253,7 +253,7 @@ class LlavaPlugin(BasePlugin):
            message["content"] = content.replace("{{image}}", self.image_token * image_seqlen)

        if len(images) != num_image_tokens:
-            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+            raise ValueError(f"The number of images does not match the number of {IMAGE_PLACEHOLDER} tokens")

        return messages

@@ -302,7 +302,7 @@ class LlavaNextPlugin(BasePlugin):
            message["content"] = content.replace("{{image}}", self.image_token)

        if len(images) != num_image_tokens:
-            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+            raise ValueError(f"The number of images does not match the number of {IMAGE_PLACEHOLDER} tokens")
        return messages

    @override
@@ -366,10 +366,10 @@ class LlavaNextVideoPlugin(BasePlugin):
                message["content"] = content.replace("{{video}}", self.video_token * video_seqlen)

        if len(images) != num_image_tokens:
-            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+            raise ValueError(f"The number of images does not match the number of {IMAGE_PLACEHOLDER} tokens")

        if len(videos) != num_video_tokens:
-            raise ValueError("The number of videos does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+            raise ValueError(f"The number of videos does not match the number of {IMAGE_PLACEHOLDER} tokens")

        return messages

@@ -408,7 +408,7 @@ class PaliGemmaPlugin(BasePlugin):
            message["content"] = content.replace("{{image}}", "")

        if len(images) != num_image_tokens:
-            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+            raise ValueError(f"The number of images does not match the number of {IMAGE_PLACEHOLDER} tokens")

        return messages

@@ -493,7 +493,7 @@ class Qwen2vlPlugin(BasePlugin):
            content = message["content"]
            while IMAGE_PLACEHOLDER in content:
                if num_image_tokens >= len(image_grid_thw):
-                    raise ValueError("`len(images)` is less than the number of {} tokens.".format(IMAGE_PLACEHOLDER))
+                    raise ValueError(f"`len(images)` is less than the number of {IMAGE_PLACEHOLDER} tokens.")

                content = content.replace(
                    IMAGE_PLACEHOLDER,
@@ -506,7 +506,7 @@ class Qwen2vlPlugin(BasePlugin):

            while VIDEO_PLACEHOLDER in content:
                if num_video_tokens >= len(video_grid_thw):
-                    raise ValueError("`len(videos)` is less than the number of {} tokens.".format(VIDEO_PLACEHOLDER))
+                    raise ValueError(f"`len(videos)` is less than the number of {VIDEO_PLACEHOLDER} tokens.")

                content = content.replace(
                    VIDEO_PLACEHOLDER,
@@ -520,10 +520,10 @@ class Qwen2vlPlugin(BasePlugin):
            message["content"] = content

        if len(images) != num_image_tokens:
-            raise ValueError("The number of images does not match the number of {} tokens".format(IMAGE_PLACEHOLDER))
+            raise ValueError(f"The number of images does not match the number of {IMAGE_PLACEHOLDER} tokens")

        if len(videos) != num_video_tokens:
-            raise ValueError("The number of videos does not match the number of {} tokens".format(VIDEO_PLACEHOLDER))
+            raise ValueError(f"The number of videos does not match the number of {VIDEO_PLACEHOLDER} tokens")

        return messages

@@ -583,10 +583,10 @@ class VideoLlavaPlugin(BasePlugin):
                message["content"] = content.replace("{{video}}", self.video_token * video_seqlen)

        if len(images) != num_image_tokens:
-            raise ValueError("The number of images does not match the number of {} tokens".format(self.image_token))
+            raise ValueError(f"The number of images does not match the number of {self.image_token} tokens")

        if len(videos) != num_video_tokens:
-            raise ValueError("The number of videos does not match the number of {} tokens".format(self.video_token))
+            raise ValueError(f"The number of videos does not match the number of {self.video_token} tokens")

        return messages

@@ -622,6 +622,6 @@ def get_mm_plugin(
 ) -> "BasePlugin":
    plugin_class = PLUGINS.get(name, None)
    if plugin_class is None:
-        raise ValueError("Multimodal plugin `{}` not found.".format(name))
+        raise ValueError(f"Multimodal plugin `{name}` not found.")

    return plugin_class(image_token, video_token)
--- a/src/llamafactory/data/parser.py
+++ b/src/llamafactory/data/parser.py
@@ -87,11 +87,11 @@ def get_dataset_list(dataset_names: Optional[Sequence[str]], dataset_dir: str) -
            config_path = os.path.join(dataset_dir, DATA_CONFIG)

        try:
-            with open(config_path, "r") as f:
+            with open(config_path) as f:
                dataset_info = json.load(f)
        except Exception as err:
            if len(dataset_names) != 0:
-                raise ValueError("Cannot open {} due to {}.".format(config_path, str(err)))
+                raise ValueError(f"Cannot open {config_path} due to {str(err)}.")

            dataset_info = None

@@ -109,7 +109,7 @@ def get_dataset_list(dataset_names: Optional[Sequence[str]], dataset_dir: str) -
            continue

        if name not in dataset_info:
-            raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
+            raise ValueError(f"Undefined dataset {name} in {DATA_CONFIG}.")

        has_hf_url = "hf_hub_url" in dataset_info[name]
        has_ms_url = "ms_hub_url" in dataset_info[name]
--- a/src/llamafactory/data/processors/pairwise.py
+++ b/src/llamafactory/data/processors/pairwise.py
@@ -110,8 +110,8 @@ def print_pairwise_dataset_example(example: Dict[str, List[int]], tokenizer: "Pr
    print("chosen_input_ids:\n{}".format(example["chosen_input_ids"]))
    print("chosen_inputs:\n{}".format(tokenizer.decode(example["chosen_input_ids"], skip_special_tokens=False)))
    print("chosen_label_ids:\n{}".format(example["chosen_labels"]))
-    print("chosen_labels:\n{}".format(tokenizer.decode(valid_chosen_labels, skip_special_tokens=False)))
+    print(f"chosen_labels:\n{tokenizer.decode(valid_chosen_labels, skip_special_tokens=False)}")
    print("rejected_input_ids:\n{}".format(example["rejected_input_ids"]))
    print("rejected_inputs:\n{}".format(tokenizer.decode(example["rejected_input_ids"], skip_special_tokens=False)))
    print("rejected_label_ids:\n{}".format(example["rejected_labels"]))
-    print("rejected_labels:\n{}".format(tokenizer.decode(valid_rejected_labels, skip_special_tokens=False)))
+    print(f"rejected_labels:\n{tokenizer.decode(valid_rejected_labels, skip_special_tokens=False)}")
--- a/src/llamafactory/data/processors/supervised.py
+++ b/src/llamafactory/data/processors/supervised.py
@@ -160,7 +160,7 @@ def preprocess_packed_supervised_dataset(
        )
        length = len(input_ids)
        if length > data_args.cutoff_len:
-            logger.warning("Dropped lengthy example with length {} > {}.".format(length, data_args.cutoff_len))
+            logger.warning(f"Dropped lengthy example with length {length} > {data_args.cutoff_len}.")
        else:
            lengths.append(length)
            length2indexes[length].append(valid_num)
@@ -212,4 +212,4 @@ def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: "
    print("input_ids:\n{}".format(example["input_ids"]))
    print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
    print("label_ids:\n{}".format(example["labels"]))
-    print("labels:\n{}".format(tokenizer.decode(valid_labels, skip_special_tokens=False)))
+    print(f"labels:\n{tokenizer.decode(valid_labels, skip_special_tokens=False)}")
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -147,7 +147,7 @@ class Template:
                elif "eos_token" in elem and tokenizer.eos_token_id is not None:
                    token_ids += [tokenizer.eos_token_id]
            else:
-                raise ValueError("Input must be string, set[str] or dict[str, str], got {}".format(type(elem)))
+                raise ValueError(f"Input must be string, set[str] or dict[str, str], got {type(elem)}")

        return token_ids

@@ -275,9 +275,9 @@ def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str)
    num_added_tokens = tokenizer.add_special_tokens({"eos_token": eos_token})

    if is_added:
-        logger.info("Add eos token: {}".format(tokenizer.eos_token))
+        logger.info(f"Add eos token: {tokenizer.eos_token}")
    else:
-        logger.info("Replace eos token: {}".format(tokenizer.eos_token))
+        logger.info(f"Replace eos token: {tokenizer.eos_token}")

    if num_added_tokens > 0:
        logger.warning("New tokens have been added, make sure `resize_vocab` is True.")
@@ -365,13 +365,13 @@ def get_template_and_fix_tokenizer(tokenizer: "PreTrainedTokenizer", data_args:
    else:
        template = TEMPLATES.get(data_args.template, None)
        if template is None:
-            raise ValueError("Template {} does not exist.".format(data_args.template))
+            raise ValueError(f"Template {data_args.template} does not exist.")

    if data_args.train_on_prompt and template.efficient_eos:
        raise ValueError("Current template does not support `train_on_prompt`.")

    if data_args.tool_format is not None:
-        logger.info("Using tool format: {}.".format(data_args.tool_format))
+        logger.info(f"Using tool format: {data_args.tool_format}.")
        eos_slots = [] if template.efficient_eos else [{"eos_token"}]
        template.format_function = FunctionFormatter(slots=eos_slots, tool_format=data_args.tool_format)
        template.format_tools = ToolFormatter(tool_format=data_args.tool_format)
@@ -389,7 +389,7 @@ def get_template_and_fix_tokenizer(tokenizer: "PreTrainedTokenizer", data_args:

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
-        logger.info("Add pad token: {}".format(tokenizer.pad_token))
+        logger.info(f"Add pad token: {tokenizer.pad_token}")

    if stop_words:
        num_added_tokens = tokenizer.add_special_tokens(
--- a/src/llamafactory/data/tool_utils.py
+++ b/src/llamafactory/data/tool_utils.py
@@ -177,6 +177,6 @@ TOOLS = {
 def get_tool_utils(name: str) -> "ToolUtils":
    tool_utils = TOOLS.get(name, None)
    if tool_utils is None:
-        raise ValueError("Tool utils `{}` not found.".format(name))
+        raise ValueError(f"Tool utils `{name}` not found.")

    return tool_utils