Support multi-image inference

Former-commit-id: 8083e4607549e805eb308c4e93c8aa256202f438
2024-11-01 07:25:20 +00:00
parent a63e624eca
commit 8cea5cd967
7 changed files with 103 additions and 63 deletions
--- a/src/llamafactory/data/mm_plugin.py
+++ b/src/llamafactory/data/mm_plugin.py
@@ -226,6 +226,14 @@ class BasePlugin:
    ) -> Dict[str, Union[List[int], "torch.Tensor"]]:
        r"""
        Builds batched multimodal inputs for VLMs.
+
+        Arguments:
+            images: a list of image inputs, shape (num_images,)
+            videos: a list of video inputs, shape (num_videos,)
+            imglens: number of images in each sample, shape (batch_size,)
+            vidlens: number of videos in each sample, shape (batch_size,)
+            seqlens: number of tokens in each sample, shape (batch_size,)
+            processor: a processor for pre-processing images and videos
        """
        self._validate_input(images, videos)
        return {}