-
-
Notifications
You must be signed in to change notification settings - Fork 10.6k
Closed
Assignees
@jaminegod
Description
Your current environment
Qwen3-VL-4B-Instruct fails to start when LoRA is enabled (`--enable-lora`). Launch command:
python3 -m vllm.entrypoints.openai.api_server \
--model /models/Qwen3-vl-4B-Instruct/ \
--host 0.0.0.0 \
--port 8000 \
--served-model-name Qwen3-vl-4B-Instruct \
--trust-remote-code \
--gpu-memory-utilization 0.8 \
--max-model-len 8192 \
--api-key sk-xxxxx \
--enable-lora
------------------------------------------------------------------------------------------------
(EngineCore_DP0 pid=759) INFO 10-15 18:25:11 [default_loader.py:267] Loading weights took 1.63 seconds
(EngineCore_DP0 pid=759) WARNING 10-15 18:25:11 [lora_model_runner_mixin.py:42] Regarding multimodal models, vLLM currently only supports adding LoRA to language model.
(EngineCore_DP0 pid=759) INFO 10-15 18:25:11 [punica_selector.py:19] Using PunicaWrapperGPU.
(EngineCore_DP0 pid=759) INFO 10-15 18:25:11 [gpu_model_runner.py:2653] Model loading took 8.6632 GiB and 1.906870 seconds
(EngineCore_DP0 pid=759) INFO 10-15 18:25:11 [gpu_model_runner.py:3344] Encoder cache will be initialized with a budget of 151250 tokens, and profiled with 1 video items of the maximum feature size.
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] EngineCore failed to start.
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] Traceback (most recent call last):
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 498, in __init__
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 92, in __init__
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self._initialize_kv_caches(vllm_config)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 190, in _initialize_kv_caches
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self.model_executor.determine_available_memory())
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 85, in determine_available_memory
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self.collective_rpc("determine_available_memory")
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return [run_method(self.driver_worker, method, args, kwargs)]
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 3122, in run_method
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 263, in determine_available_memory
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self.model_runner.profile_run()
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3361, in profile_run
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self.model.get_multimodal_embeddings(
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1381, in get_multimodal_embeddings
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] video_embeddings = self._process_video_input(multimodal_input)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1335, in _process_video_input
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] video_embeds = self.visual(pixel_values_videos,
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 517, in forward
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] hidden_states = blk(hidden_states,
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 200, in forward
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] x = x + self.attn(self.norm1(x),
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2_5_vl.py", line 415, in forward
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] output, _ = self.proj(context_layer)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/layers/row_parallel_linear.py", line 70, in forward
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] output_parallel = self.apply(input_parallel)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/layers/base_linear.py", line 151, in apply
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] torch.Tensor] = self.punica_wrapper.add_lora_linear(
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 215, in add_lora_linear
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self.add_shrink(
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 77, in add_shrink
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] lora_shrink(
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1243, in __call__
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._op(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/ops/triton_ops/lora_shrink_op.py", line 149, in _lora_shrink
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] assert token_lora_mapping.size(0) == M
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] AssertionError
(EngineCore_DP0 pid=759) Process EngineCore_DP0:
(EngineCore_DP0 pid=759) Traceback (most recent call last):
(EngineCore_DP0 pid=759) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=759) self.run()
(EngineCore_DP0 pid=759) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=759) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
(EngineCore_DP0 pid=759) raise e
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
(EngineCore_DP0 pid=759) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 498, in __init__
(EngineCore_DP0 pid=759) super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 92, in __init__
(EngineCore_DP0 pid=759) self._initialize_kv_caches(vllm_config)
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 190, in _initialize_kv_caches
(EngineCore_DP0 pid=759) self.model_executor.determine_available_memory())
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 85, in determine_available_memory
(EngineCore_DP0 pid=759) return self.collective_rpc("determine_available_memory")
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
(EngineCore_DP0 pid=759) return [run_method(self.driver_worker, method, args, kwargs)]
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 3122, in run_method
(EngineCore_DP0 pid=759) return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=759) return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 263, in determine_available_memory
(EngineCore_DP0 pid=759) self.model_runner.profile_run()
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3361, in profile_run
(EngineCore_DP0 pid=759) self.model.get_multimodal_embeddings(
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1381, in get_multimodal_embeddings
(EngineCore_DP0 pid=759) video_embeddings = self._process_video_input(multimodal_input)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1335, in _process_video_input
(EngineCore_DP0 pid=759) video_embeds = self.visual(pixel_values_videos,
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 517, in forward
(EngineCore_DP0 pid=759) hidden_states = blk(hidden_states,
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 200, in forward
(EngineCore_DP0 pid=759) x = x + self.attn(self.norm1(x),
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2_5_vl.py", line 415, in forward
(EngineCore_DP0 pid=759) output, _ = self.proj(context_layer)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/layers/row_parallel_linear.py", line 70, in forward
(EngineCore_DP0 pid=759) output_parallel = self.apply(input_parallel)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/layers/base_linear.py", line 151, in apply
(EngineCore_DP0 pid=759) torch.Tensor] = self.punica_wrapper.add_lora_linear(
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 215, in add_lora_linear
(EngineCore_DP0 pid=759) self.add_shrink(
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 77, in add_shrink
(EngineCore_DP0 pid=759) lora_shrink(
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1243, in __call__
(EngineCore_DP0 pid=759) return self._op(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=759) return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/ops/triton_ops/lora_shrink_op.py", line 149, in _lora_shrink
(EngineCore_DP0 pid=759) assert token_lora_mapping.size(0) == M
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) AssertionError
[rank0]:[W1015 18:25:16.185357947 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
(APIServer pid=622) Traceback (most recent call last):
(APIServer pid=622) File "<frozen runpy>", line 198, in _run_module_as_main
(APIServer pid=622) File "<frozen runpy>", line 88, in _run_code
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
(APIServer pid=622) uvloop.run(run_server(args))
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run
(APIServer pid=622) return __asyncio.run(
(APIServer pid=622) ^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=622) return runner.run(main)
(APIServer pid=622) ^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=622) return self._loop.run_until_complete(task)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper
(APIServer pid=622) return await main
(APIServer pid=622) ^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
(APIServer pid=622) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
(APIServer pid=622) async with build_async_engine_client(
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=622) return await anext(self.gen)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
(APIServer pid=622) async with build_async_engine_client_from_engine_args(
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=622) return await anext(self.gen)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
(APIServer pid=622) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 1572, in inner
(APIServer pid=622) return fn(*args, **kwargs)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
(APIServer pid=622) return cls(
(APIServer pid=622) ^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
(APIServer pid=622) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
(APIServer pid=622) return AsyncMPClient(*client_args)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 769, in __init__
(APIServer pid=622) super().__init__(
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 448, in __init__
(APIServer pid=622) with launch_core_engines(vllm_config, executor_class,
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
(APIServer pid=622) next(self.gen)
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 732, in launch_core_engines
(APIServer pid=622) wait_for_engine_startup(
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 785, in wait_for_engine_startup
(APIServer pid=622) raise RuntimeError("Engine core initialization failed. "
(APIServer pid=622) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
🐛 Describe the bug
Qwen3-VL-4B-Instruct fails to start when LoRA is enabled (`--enable-lora`). Launch command:
python3 -m vllm.entrypoints.openai.api_server \
--model /models/Qwen3-vl-4B-Instruct/ \
--host 0.0.0.0 \
--port 8000 \
--served-model-name Qwen3-vl-4B-Instruct \
--trust-remote-code \
--gpu-memory-utilization 0.8 \
--max-model-len 8192 \
--api-key sk-xxxxx \
--enable-lora
------------------------------------------------------------------------------------------------
(EngineCore_DP0 pid=759) INFO 10-15 18:25:11 [default_loader.py:267] Loading weights took 1.63 seconds
(EngineCore_DP0 pid=759) WARNING 10-15 18:25:11 [lora_model_runner_mixin.py:42] Regarding multimodal models, vLLM currently only supports adding LoRA to language model.
(EngineCore_DP0 pid=759) INFO 10-15 18:25:11 [punica_selector.py:19] Using PunicaWrapperGPU.
(EngineCore_DP0 pid=759) INFO 10-15 18:25:11 [gpu_model_runner.py:2653] Model loading took 8.6632 GiB and 1.906870 seconds
(EngineCore_DP0 pid=759) INFO 10-15 18:25:11 [gpu_model_runner.py:3344] Encoder cache will be initialized with a budget of 151250 tokens, and profiled with 1 video items of the maximum feature size.
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] EngineCore failed to start.
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] Traceback (most recent call last):
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 498, in __init__
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 92, in __init__
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self._initialize_kv_caches(vllm_config)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 190, in _initialize_kv_caches
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self.model_executor.determine_available_memory())
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 85, in determine_available_memory
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self.collective_rpc("determine_available_memory")
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return [run_method(self.driver_worker, method, args, kwargs)]
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 3122, in run_method
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 263, in determine_available_memory
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self.model_runner.profile_run()
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3361, in profile_run
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self.model.get_multimodal_embeddings(
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1381, in get_multimodal_embeddings
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] video_embeddings = self._process_video_input(multimodal_input)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1335, in _process_video_input
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] video_embeds = self.visual(pixel_values_videos,
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 517, in forward
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] hidden_states = blk(hidden_states,
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 200, in forward
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] x = x + self.attn(self.norm1(x),
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2_5_vl.py", line 415, in forward
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] output, _ = self.proj(context_layer)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/layers/row_parallel_linear.py", line 70, in forward
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] output_parallel = self.apply(input_parallel)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/layers/base_linear.py", line 151, in apply
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] torch.Tensor] = self.punica_wrapper.add_lora_linear(
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 215, in add_lora_linear
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] self.add_shrink(
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 77, in add_shrink
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] lora_shrink(
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1243, in __call__
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return self._op(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] File "/usr/local/lib/python3.12/dist-packages/vllm/lora/ops/triton_ops/lora_shrink_op.py", line 149, in _lora_shrink
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] assert token_lora_mapping.size(0) == M
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) ERROR 10-15 18:25:16 [core.py:708] AssertionError
(EngineCore_DP0 pid=759) Process EngineCore_DP0:
(EngineCore_DP0 pid=759) Traceback (most recent call last):
(EngineCore_DP0 pid=759) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=759) self.run()
(EngineCore_DP0 pid=759) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=759) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 712, in run_engine_core
(EngineCore_DP0 pid=759) raise e
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
(EngineCore_DP0 pid=759) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 498, in __init__
(EngineCore_DP0 pid=759) super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 92, in __init__
(EngineCore_DP0 pid=759) self._initialize_kv_caches(vllm_config)
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 190, in _initialize_kv_caches
(EngineCore_DP0 pid=759) self.model_executor.determine_available_memory())
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 85, in determine_available_memory
(EngineCore_DP0 pid=759) return self.collective_rpc("determine_available_memory")
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
(EngineCore_DP0 pid=759) return [run_method(self.driver_worker, method, args, kwargs)]
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 3122, in run_method
(EngineCore_DP0 pid=759) return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=759) return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 263, in determine_available_memory
(EngineCore_DP0 pid=759) self.model_runner.profile_run()
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3361, in profile_run
(EngineCore_DP0 pid=759) self.model.get_multimodal_embeddings(
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1381, in get_multimodal_embeddings
(EngineCore_DP0 pid=759) video_embeddings = self._process_video_input(multimodal_input)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1335, in _process_video_input
(EngineCore_DP0 pid=759) video_embeds = self.visual(pixel_values_videos,
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 517, in forward
(EngineCore_DP0 pid=759) hidden_states = blk(hidden_states,
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 200, in forward
(EngineCore_DP0 pid=759) x = x + self.attn(self.norm1(x),
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2_5_vl.py", line 415, in forward
(EngineCore_DP0 pid=759) output, _ = self.proj(context_layer)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(EngineCore_DP0 pid=759) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(EngineCore_DP0 pid=759) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/layers/row_parallel_linear.py", line 70, in forward
(EngineCore_DP0 pid=759) output_parallel = self.apply(input_parallel)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/layers/base_linear.py", line 151, in apply
(EngineCore_DP0 pid=759) torch.Tensor] = self.punica_wrapper.add_lora_linear(
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 215, in add_lora_linear
(EngineCore_DP0 pid=759) self.add_shrink(
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/punica_wrapper/punica_gpu.py", line 77, in add_shrink
(EngineCore_DP0 pid=759) lora_shrink(
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1243, in __call__
(EngineCore_DP0 pid=759) return self._op(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(EngineCore_DP0 pid=759) return func(*args, **kwargs)
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) File "/usr/local/lib/python3.12/dist-packages/vllm/lora/ops/triton_ops/lora_shrink_op.py", line 149, in _lora_shrink
(EngineCore_DP0 pid=759) assert token_lora_mapping.size(0) == M
(EngineCore_DP0 pid=759) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=759) AssertionError
[rank0]:[W1015 18:25:16.185357947 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
(APIServer pid=622) Traceback (most recent call last):
(APIServer pid=622) File "<frozen runpy>", line 198, in _run_module_as_main
(APIServer pid=622) File "<frozen runpy>", line 88, in _run_code
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1953, in <module>
(APIServer pid=622) uvloop.run(run_server(args))
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run
(APIServer pid=622) return __asyncio.run(
(APIServer pid=622) ^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=622) return runner.run(main)
(APIServer pid=622) ^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=622) return self._loop.run_until_complete(task)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper
(APIServer pid=622) return await main
(APIServer pid=622) ^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1884, in run_server
(APIServer pid=622) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1902, in run_server_worker
(APIServer pid=622) async with build_async_engine_client(
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=622) return await anext(self.gen)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 180, in build_async_engine_client
(APIServer pid=622) async with build_async_engine_client_from_engine_args(
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=622) return await anext(self.gen)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 225, in build_async_engine_client_from_engine_args
(APIServer pid=622) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 1572, in inner
(APIServer pid=622) return fn(*args, **kwargs)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
(APIServer pid=622) return cls(
(APIServer pid=622) ^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
(APIServer pid=622) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
(APIServer pid=622) return AsyncMPClient(*client_args)
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 769, in __init__
(APIServer pid=622) super().__init__(
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 448, in __init__
(APIServer pid=622) with launch_core_engines(vllm_config, executor_class,
(APIServer pid=622) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=622) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
(APIServer pid=622) next(self.gen)
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 732, in launch_core_engines
(APIServer pid=622) wait_for_engine_startup(
(APIServer pid=622) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 785, in wait_for_engine_startup
(APIServer pid=622) raise RuntimeError("Engine core initialization failed. "
(APIServer pid=622) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
Before submitting a new issue...
- Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.