[data] Fix Qwen3VL plugin (#9297)

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Yaowei Zheng <hiyouga@buaa.edu.cn>
Co-authored-by: kingsley <kingsleydodonow@gmail.com>
commit 129e918106
parent 9c0d033a15
Author: Xiaosu Zhu
Date: 2025-10-26 16:07:04 +08:00
Committed by: GitHub
2 changed files with 89 additions and 34 deletions


@@ -16,6 +16,7 @@ import gc
 import json
 from typing import Optional
 
+import av
 import fire
 from tqdm import tqdm
 from transformers import Seq2SeqTrainingArguments
@@ -33,6 +34,14 @@ if is_vllm_available():
    from vllm.lora.request import LoRARequest


+def _need_video_kwargs(template):
+    NEEDED_TEMPLATE = ["qwen3_vl", "glm4v"]
+    if any(t in template for t in NEEDED_TEMPLATE):
+        return True
+
+    return False
+
+
 def vllm_infer(
     model_name_or_path: str,
     adapter_name_or_path: str = None,
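
The helper gates the new metadata path purely by template name, using a substring match. A minimal sketch of its behavior (qwen3_vl_instruct is a hypothetical variant name, used only to illustrate the substring semantics):

# Any template whose name contains "qwen3_vl" or "glm4v" qualifies.
assert _need_video_kwargs("qwen3_vl") is True
assert _need_video_kwargs("qwen3_vl_instruct") is True  # hypothetical variant name
assert _need_video_kwargs("glm4v") is True
assert _need_video_kwargs("qwen2_vl") is False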
@@ -132,6 +141,7 @@ def vllm_infer(
     # Store all results in these lists
     all_prompts, all_preds, all_labels = [], [], []
+    need_video_kwargs = _need_video_kwargs(template)
 
     # Add batch process to avoid the issue of too many files opened
     for i in tqdm(range(0, len(train_dataset), batch_size), desc="Processing batched inference"):
@@ -147,6 +157,7 @@
                     )["images"]
                 }
             elif batch["videos"][j] is not None:
+                video_metadata, video_metadata_kwargs = None, None
                 video = batch["videos"][j]
                 multi_modal_data = {
                     "video": template_obj.mm_plugin._regularize_videos(
@@ -157,6 +168,25 @@
                         video_maxlen=video_maxlen,
                     )["videos"]
                 }
+                if need_video_kwargs:
+                    container = av.open(video[0], "r")
+                    video_stream = next(stream for stream in container.streams if stream.type == "video")
+                    sampling_indices = template_obj.mm_plugin._get_video_sample_indices(
+                        video_stream, video_fps, video_maxlen
+                    )
+                    total_frames = video_stream.frames
+                    video_metadata_kwargs = {
+                        "fps": getattr(tokenizer_module["processor"], "video_fps", 24.0),
+                        "do_sample_frames": False,
+                        "total_num_frames": total_frames,
+                    }
+                    video_metadata = dict(
+                        fps=video_fps,
+                        frames_indices=sampling_indices,
+                        total_num_frames=total_frames,
+                        video_backend="opencv",
+                    )
+                    multi_modal_data["video"] = (multi_modal_data["video"], video_metadata)
             elif batch["audios"][j] is not None:
                 audio = batch["audios"][j]
                 audio_data = template_obj.mm_plugin._regularize_audios(
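
vLLM accepts a video entry either as raw frames or as a (frames, metadata) tuple; the block above builds the tuple form and sets do_sample_frames to False so the processor does not resample the frames that _regularize_videos already selected. Below is a self-contained sketch of the same PyAV bookkeeping, assuming a placeholder file sample.mp4 and a fixed 2-fps target; uniform striding stands in for mm_plugin._get_video_sample_indices:

import av

# Open the clip and locate its video stream, as the block above does.
container = av.open("sample.mp4", "r")  # placeholder path
stream = next(s for s in container.streams if s.type == "video")
total_frames = stream.frames  # may be 0 when the container does not report a frame count
native_fps = float(stream.average_rate or 24.0)
container.close()  # release the file handle promptly

# Uniform striding as a stand-in for the plugin's sampling-index helper.
target_fps = 2.0
step = max(native_fps / target_fps, 1.0)
frame_indices = [int(i * step) for i in range(max(int(total_frames / step), 1))]

video_metadata = {
    "fps": target_fps,
    "frames_indices": frame_indices,
    "total_num_frames": total_frames,
    "video_backend": "opencv",
}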
@@ -167,7 +197,11 @@
             else:
                 multi_modal_data = None
-            vllm_inputs.append({"prompt_token_ids": batch["input_ids"][j], "multi_modal_data": multi_modal_data})
+            vllm_input_data = {"prompt_token_ids": batch["input_ids"][j], "multi_modal_data": multi_modal_data}
+            if "video_metadata_kwargs" in locals() and video_metadata_kwargs is not None:
+                vllm_input_data["mm_processor_kwargs"] = video_metadata_kwargs
+            vllm_inputs.append(vllm_input_data)
 
             prompts.append(tokenizer.decode(batch["input_ids"][j], skip_special_tokens=skip_special_tokens))
             labels.append(
                 tokenizer.decode(
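
Since video_metadata_kwargs is only bound once a video sample has been seen, the commit guards the per-request kwargs with a locals() lookup. A hedged alternative sketch (build_vllm_input is a hypothetical helper, not part of the commit) that keeps the same behavior while making the optional kwargs an explicit parameter, so state from a previous video sample cannot leak onto a following image or audio sample:

from typing import Any, Optional

def build_vllm_input(
    prompt_token_ids: list[int],
    multi_modal_data: Optional[dict[str, Any]],
    video_metadata_kwargs: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    # Assemble one vLLM request dict; attach mm_processor_kwargs only when a
    # video sample actually produced them.
    vllm_input_data: dict[str, Any] = {
        "prompt_token_ids": prompt_token_ids,
        "multi_modal_data": multi_modal_data,
    }
    if video_metadata_kwargs is not None:
        vllm_input_data["mm_processor_kwargs"] = video_metadata_kwargs
    return vllm_input_data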