[model] support audio (#6701)

* support qwen2_audio
* improve code
* lint
* fix
* fix
* fix

---------

Co-authored-by: hiyouga <hiyouga@buaa.edu.cn>
Former-commit-id: 5eacb5629e4d7733cd992a63747a1335f2c6a929
2025-02-05 04:59:09 +08:00
parent 9feb78e7b4
commit 8f401e37f8
35 changed files with 675 additions and 213 deletions
--- a/src/llamafactory/chat/chat_model.py
+++ b/src/llamafactory/chat/chat_model.py
@@ -27,7 +27,7 @@ from .vllm_engine import VllmEngine


 if TYPE_CHECKING:
-    from ..data.mm_plugin import ImageInput, VideoInput
+    from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
    from .base_engine import BaseEngine, Response


@@ -66,13 +66,14 @@ class ChatModel:
        tools: Optional[str] = None,
        images: Optional[Sequence["ImageInput"]] = None,
        videos: Optional[Sequence["VideoInput"]] = None,
+        audios: Optional[Sequence["AudioInput"]] = None,
        **input_kwargs,
    ) -> List["Response"]:
        r"""
        Gets a list of responses of the chat model.
        """
        task = asyncio.run_coroutine_threadsafe(
-            self.achat(messages, system, tools, images, videos, **input_kwargs), self._loop
+            self.achat(messages, system, tools, images, videos, audios, **input_kwargs), self._loop
        )
        return task.result()

@@ -83,12 +84,13 @@ class ChatModel:
        tools: Optional[str] = None,
        images: Optional[Sequence["ImageInput"]] = None,
        videos: Optional[Sequence["VideoInput"]] = None,
+        audios: Optional[Sequence["AudioInput"]] = None,
        **input_kwargs,
    ) -> List["Response"]:
        r"""
        Asynchronously gets a list of responses of the chat model.
        """
-        return await self.engine.chat(messages, system, tools, images, videos, **input_kwargs)
+        return await self.engine.chat(messages, system, tools, images, videos, audios, **input_kwargs)

    def stream_chat(
        self,
@@ -97,12 +99,13 @@ class ChatModel:
        tools: Optional[str] = None,
        images: Optional[Sequence["ImageInput"]] = None,
        videos: Optional[Sequence["VideoInput"]] = None,
+        audios: Optional[Sequence["AudioInput"]] = None,
        **input_kwargs,
    ) -> Generator[str, None, None]:
        r"""
        Gets the response token-by-token of the chat model.
        """
-        generator = self.astream_chat(messages, system, tools, images, videos, **input_kwargs)
+        generator = self.astream_chat(messages, system, tools, images, videos, audios, **input_kwargs)
        while True:
            try:
                task = asyncio.run_coroutine_threadsafe(generator.__anext__(), self._loop)
@@ -117,12 +120,15 @@ class ChatModel:
        tools: Optional[str] = None,
        images: Optional[Sequence["ImageInput"]] = None,
        videos: Optional[Sequence["VideoInput"]] = None,
+        audios: Optional[Sequence["AudioInput"]] = None,
        **input_kwargs,
    ) -> AsyncGenerator[str, None]:
        r"""
        Asynchronously gets the response token-by-token of the chat model.
        """
-        async for new_token in self.engine.stream_chat(messages, system, tools, images, videos, **input_kwargs):
+        async for new_token in self.engine.stream_chat(
+            messages, system, tools, images, videos, audios, **input_kwargs
+        ):
            yield new_token

    def get_scores(