## vLLM DBO code structure

```python
class UBatchWrapper:
    def __init__(
        self,
        runnable: Callable,
        vllm_config: VllmConfig,
        runtime_mode: CUDAGraphMode,
        device: torch.cuda.device,
    ):
        self.runnable = runnable
        self.vllm_config = vllm_config
        self.compilation_config = vllm_config.compilation_config
        self.comm_stream = torch.cuda.Stream(device=device)
        # Two ubatch threads plus the main thread
        self.ready_barrier = threading.Barrier(3)
        self.cudagraphs: dict[int, CUDAGraphMetaData] = {}
        self.cudagraph_wrapper = None
        self.graph_pool = None
        if runtime_mode is not CUDAGraphMode.NONE:
            self.cudagraph_wrapper = CUDAGraphWrapper(
                runnable, vllm_config, runtime_mode=runtime_mode
            )
            self.graph_pool = current_platform.get_global_graph_pool()
        self.sm_control = self._create_sm_control_context(vllm_config)
        self.device = device
```

https://github.com/vllm-project/vllm-ascend/issues/2599

In vLLM, we can search PRs for `[Core/DBO]`.

## notes

> Could you provide some performance improvement data? I tested DeepSeek V2 Lite locally and observed a negative performance gain, with the per-step latency increasing from 38 ms to 49 ms. The process of launching vLLM and the test results are shown below.

> According to the Nsys profile data, after enabling DBO, the execution time of both the `batched_triton_kernel` and `vllm::act_and_mul_kernel` kernels has increased significantly.

Yes, this is expected; DBO will increase the GEMM time when running a memory-bound workload, since the full model weights have to be loaded twice (once for each microbatch). So DBO is only really beneficial when the communication time is more than 1x the GEMM time; it is really only intended for multi-node EP setups, where the communication costs are much higher. It is not expected to provide a speed-up in a single-node environment.
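The trade-off is easier to see with a toy model of the overlap. The sketch below is a hypothetical illustration using plain `threading` and `time.sleep`, not vLLM's actual implementation; `run_ubatch`, `GEMM_TIME`, `COMM_TIME`, and `NUM_LAYERS` are made-up names and numbers. Two microbatch threads plus the main thread meet at a `Barrier(3)` (mirroring `ready_barrier` above), the two microbatches' GEMMs contend for the same SMs (modeled with a lock), and each microbatch's communication overlaps the other's compute.

```python
import threading
import time

# Hypothetical per-layer timings in seconds; illustrative only, not measured.
GEMM_TIME = 0.010   # compute (GEMM) time for one microbatch
COMM_TIME = 0.015   # dispatch/combine (all-to-all) time for one microbatch
NUM_LAYERS = 4

def run_ubatch(gpu_lock: threading.Lock, ready: threading.Barrier) -> None:
    """One microbatch thread: compute is exclusive, communication can overlap."""
    ready.wait()  # mirrors UBatchWrapper's ready_barrier handshake with the main thread
    for _ in range(NUM_LAYERS):
        with gpu_lock:           # the two microbatches' GEMMs contend for the same SMs
            time.sleep(GEMM_TIME)
        time.sleep(COMM_TIME)    # comm overlaps the other microbatch's compute

def main() -> None:
    ready = threading.Barrier(3)  # two ubatch threads plus the main thread
    gpu_lock = threading.Lock()
    threads = [
        threading.Thread(target=run_ubatch, args=(gpu_lock, ready)) for _ in range(2)
    ]
    for t in threads:
        t.start()
    start = time.perf_counter()
    ready.wait()                  # release both microbatches together
    for t in threads:
        t.join()
    overlapped = time.perf_counter() - start
    # Baseline: the two microbatches run back to back, no comm/compute overlap.
    serial = 2 * NUM_LAYERS * (GEMM_TIME + COMM_TIME)
    print(f"overlapped: {overlapped * 1e3:.1f} ms, "
          f"non-overlapped estimate: {serial * 1e3:.1f} ms")

if __name__ == "__main__":
    main()
```

With `COMM_TIME` larger than `GEMM_TIME`, the overlapped schedule hides most of the communication relative to running the two microbatches back to back. The regression reported above comes from the other side of the trade-off: a single fused batch would load the weights of a memory-bound GEMM only once, so splitting it in two can cost more than the overlap saves.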
## kernels

```
ampere_bf16_s16816gemm_bf16_128x128_ldg8_f2f_stages_64x3_tn
ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_stages_64x4_tn
ampere_bf16_s16816gemm_bf16_64x64_sliced1x2_ldg8_f2f_stages_64x5_tn
ampere_bf16_s16816gemm_bf16_64x64_sliced1x2_ldg8_f2f_stages_64x6_tn
fused_moe_kernel
std::enable_if(int)0&&vllm::_typeConvert::exists, void>::type vllm::fused_add_rms_norm_kernel(T1 *, T1 *, const T1 *, float, int, int)
void at::native::::cunn_SoftMaxForward<(int)4, float, float, float, at::native::::SoftMaxForwardEpilogue>(T4 *, const T2 *, int)
void at::native::::distribution_elementwise_grid_stride_kernel(at::TensorIteratorBase &, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T3, T4)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::::distribution_nullary_kernel(at::TensorIteratorBase &, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T3, T4)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::templates::cuda::exponential_kernel(at::TensorIteratorBase &, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T4, const T5 &, T6)::[lambda(int, float) (instance 1)]>(long, at::PhiloxCudaState, T3, T4)
void at::native::::indexSelectLargeIndex(at::cuda::detail::TensorInfo, at::cuda::detail::TensorInfo, at::cuda::detail::TensorInfo, int, int, T3, T3, long)
void at::native::::indexSelectSmallIndex(at::cuda::detail::TensorInfo, at::cuda::detail::TensorInfo, at::cuda::detail::TensorInfo, int, int, T3, long)
void at::native::elementwise_kernel<(int)128, (int)2, void at::native::gpu_kernel_impl_nocast>>(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)
void at::native::elementwise_kernel<(int)128, (int)2, void at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)
void at::native::elementwise_kernel<(int)128, (int)4, void at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3)
void at::native::index_elementwise_kernel<(int)128, (int)4, void at::native::gpu_index_kernel>(at::TensorIteratorBase &, c10::ArrayRef, c10::ArrayRef)::[lambda(char *, const char *, long) (instance 1)]>(at::TensorIteratorBase &, c10::ArrayRef, c10::ArrayRef, const T1 &)::[lambda(int) (instance 1)]>(long, T3)
void at::native::reduce_kernel<(int)128, (int)4, at::native::ReduceOp::operator ()(at::TensorIterator &)::[lambda(float, float) (instance 1)]>, unsigned int, c10::BFloat16, (int)4, (int)4>>(T3)
void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp, unsigned int, long, (int)4, (int)4>>(T3)
void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp::operator ()(at::TensorIterator &)::[lambda(float, float) (instance 1)]>, unsigned int, float, (int)4, (int)4>>(T3)
void at::native::unrolled_elementwise_kernel, std::array, (int)8, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, T1, T2, T4, T5, T6, T7)
void at::native::unrolled_elementwise_kernel, std::array, (int)8, TrivialOffsetCalculator<(int)0, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, T1, T2, T4, T5, T6, T7)
void at::native::unrolled_elementwise_kernel, std::array, (int)8, TrivialOffsetCalculator<(int)0, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, T1, T2, T4, T5, T6, T7)
void at::native::unrolled_elementwise_kernel, (int)8, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)1>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T4, T5, T6, T7)
void at::native::unrolled_elementwise_kernel, (int)8, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)1>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T4, T5, T6, T7)
void at::native::unrolled_elementwise_kernel, (int)8, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)1>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T4, T5, T6, T7)
void at::native::vectorized_elementwise_kernel<(int)2, at::native::FillFunctor, std::array>(int, T2, T3)
void at::native::vectorized_elementwise_kernel<(int)2, at::native::FillFunctor, std::array>(int, T2, T3)
void at::native::vectorized_elementwise_kernel<(int)4, at::native::BinaryFunctor>, std::array>(int, T2, T3)
void at::native::vectorized_elementwise_kernel<(int)4, at::native::FillFunctor, std::array>(int, T2, T3)
void at::native::vectorized_elementwise_kernel<(int)8, at::native::FillFunctor, std::array>(int, T2, T3)
void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, (bool)0, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, (bool)1, (bool)0, (bool)0>(cublasLt::cublasSplitKParams, const T4 *, const T9 *, T8 *, T5 *, const T6 *, const T6 *, const T10 *, const T4 *, T10 *, void *, long, T6 *, int *, T6 *, const T6 *, const T6 *, const T6 *, const T6 *)
void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, (bool)0, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, (bool)1, (bool)0, (bool)0>(cublasLt::cublasSplitKParams, const T4 *, const T9 *, T8 *, T5 *, const T6 *, const T6 *, const T10 *, const T4 *, T10 *, void *, long, T6 *, int *, T6 *, const T6 *, const T6 *, const T6 *, const T6 *)
void cutlass::Kernel2(T1::Params)
void cutlass::Kernel2(T1::Params)
void cutlass::Kernel2(T1::Params)
void cutlass::Kernel2(T1::Params)
void flash::flash_fwd_splitkv_combine_kernel>, (int)4, (int)1, (bool)1>(flash::Flash_fwd_params)
void flash::flash_fwd_splitkv_kernel>, (bool)0, (bool)0, (bool)0, (bool)0, (bool)1, (bool)0, (bool)0, (bool)0>(flash::Flash_fwd_params)
void flash::flash_fwd_splitkv_kernel>, (bool)0, (bool)0, (bool)0, (bool)0, (bool)1, (bool)0, (bool)1, (bool)0>(flash::Flash_fwd_params)
void flash::flash_fwd_splitkv_kernel>, (bool)1, (bool)0, (bool)0, (bool)0, (bool)1, (bool)0, (bool)0, (bool)0>(flash::Flash_fwd_params)
void vllm::act_and_mul_kernel, (bool)1>(T1 *, const T1 *, int)
void vllm::moe::count_and_sort_expert_tokens_kernel(const T1 *, int *, int *, unsigned long)
void vllm::moe::moe_align_block_size_kernel(const T1 *, int *, int *, int *, int, int, int, int, unsigned long, int *)
void vllm::moe::topkGatingSoftmax<(int)4, (int)128, (int)4, (int)16, int>(const float *, const bool *, float *, int, T5 *, int *, int, int, int)
void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0>(const T1 *, const T1 *, T2 *, T2 *, const long *, long, long, long, long, long, int, int, int, const float *, const float *)
void vllm::rms_norm_kernel(T1 *, const T1 *, const T1 *, float, int, int)
void vllm::rotary_embedding_kernel(const long *, T1 *, T1 *, const T1 *, int, long, long, long, int, int, int)
```