[model] support audio (#6701)

* support qwen2_audio

* improve code

* lint

* fix

* fix

* fix

---------

Co-authored-by: hiyouga <hiyouga@buaa.edu.cn>
Former-commit-id: 5eacb5629e4d7733cd992a63747a1335f2c6a929
Zhangchi Feng
2025-02-05 04:59:09 +08:00
committed by GitHub
parent 9feb78e7b4
commit 8f401e37f8
35 changed files with 675 additions and 213 deletions


@@ -26,9 +26,9 @@ from ..extras import logging
 from ..extras.constants import (
     DATA_CONFIG,
     DEFAULT_TEMPLATE,
+    MULTIMODAL_SUPPORTED_MODELS,
     SUPPORTED_MODELS,
     TRAINING_ARGS,
-    VISION_MODELS,
     DownloadSource,
 )
 from ..extras.misc import use_modelscope, use_openmind
@@ -136,13 +136,6 @@ def get_template(model_name: str) -> str:
     return DEFAULT_TEMPLATE.get(model_name, "default")
 
 
-def get_visual(model_name: str) -> bool:
-    r"""
-    Judges if the model is a vision language model.
-    """
-    return model_name in VISION_MODELS
-
-
 def get_time() -> str:
     r"""
     Gets current date and time.
@@ -150,6 +143,13 @@ def get_time() -> str:
     return datetime.now().strftime(r"%Y-%m-%d-%H-%M-%S")
 
 
+def is_multimodal(model_name: str) -> bool:
+    r"""
+    Judges if the model is a vision language model.
+    """
+    return model_name in MULTIMODAL_SUPPORTED_MODELS
+
+
 def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]:
     r"""
     Loads dataset_info.json.
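
For context on the rename above: the snippet below is a minimal, self-contained sketch of the new check. Only the helper's name, body, and the MULTIMODAL_SUPPORTED_MODELS constant come from this diff; the set contents and the example model names are illustrative assumptions, not values taken from the repository. Note that the added docstring still says "vision language model" even though the registry now also covers audio models such as Qwen2-Audio.

# Minimal sketch -- the set contents are placeholder assumptions;
# only the helper's logic mirrors the diff above.
from typing import Set

MULTIMODAL_SUPPORTED_MODELS: Set[str] = {
    "Qwen2-VL-7B-Instruct",      # vision-language example (assumed entry)
    "Qwen2-Audio-7B-Instruct",   # audio example (assumed entry)
}


def is_multimodal(model_name: str) -> bool:
    r"""
    Judges if the model accepts non-text inputs (images, audio, ...).
    """
    return model_name in MULTIMODAL_SUPPORTED_MODELS


if __name__ == "__main__":
    # A caller that previously branched on get_visual() can switch to the broader check.
    for name in ("Qwen2-Audio-7B-Instruct", "Llama-3-8B-Instruct"):
        print(name, is_multimodal(name))

Callers that only need the general "is this model multimodal" question use the single set membership test; any vision-specific behavior would have to be handled separately, since the VISION_MODELS constant is removed by this commit.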