# For ali vLLM

- 使用 `https://code.alibaba-inc.com/algo/llm_scripts/tree/main/vllm_v1/pd_ep_qwen_dlc_mpirun` 的启动脚本
- 修改 vllm 源码的 check 部分使用 BLADNN_ATTN
- 关闭 eager 避免 build 超时
- install 最新的 GEMM 支持 FP8

`['BLADNN_ATTN', 'FLASH_ATTN', 'TRITON_ATTN', 'XFORMERS', 'ROCM_ATTN', 'ROCM_AITER_MLA', 'ROCM_AITER_FA', 'TORCH_SDPA', 'FLASHINFER', 'FLASHINFER_MLA', 'TRITON_MLA', 'CUTLASS_MLA', 'FLASHMLA', 'FLASHMLA_SPARSE', 'FLASH_ATTN_MLA', 'PALLAS', 'IPEX', 'DUAL_CHUNK_FLASH_ATTN', 'SPARSE_FLASH_ATTN', 'NO_ATTENTION', 'FLEX_ATTENTION', 'TREE_ATTN', 'ROCM_AITER_UNIFIED_ATTN']`

dashllm:deepep_cp312_test_v1_deepep_274

# Nsys

```
nsys profile -o candle_trace --trace=cuda,nvtx \
cargo run --features cuda --release --example qwen -- --prompt "Hello there " --tokenizer-file /mnt/debugger/wjh/models/Qwen2-7B/tokenizer.json --weight-files /mnt/debugger/wjh/models/Qwen2-7B/model-00001-of-00004.safetensors,/mnt/debugger/wjh/models/Qwen2-7B/model-00002-of-00004.safetensors,/mnt/debugger/wjh/models/Qwen2-7B/model-00003-of-00004.safetensors,/mnt/debugger/wjh/models/Qwen2-7B/model-00004-of-00004.safetensors --model 2-7b
```

embedding: 满足 `abs(candle[i] - torch[i]) < 1e-7`

```
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- -------- -------- -------- -------- ----------- ----------------------------------------------------------------------------------------------------
60.2 44635338548 371520 120142.5 101728.0 33984 688448 49045.5 fused_moe_kernel
5.0 3690934670 182880 20182.3 20160.0 19392 27424 244.2 void cutlass::Kernel2(T1::Par…
4.2 3094961330 182880 16923.5 16800.0 15936 22688 672.4 ampere_bf16_s16816gemm_bf16_64x64_sliced1x2_ldg8_f2f_stages_64x6_tn
3.8 2842715582 737280 3855.7 3904.0 3328 9696 190.2 void at::native::elementwise_kernel<(int)128, (int)4, void at::native::gpu_kernel_impl_nocast(T1::Params)
2.0 1468689277 185760 7906.4 7872.0 7775 11040 126.8 void vllm::moe::moe_align_block_size_kernel(const T1 *, int 
*, int *, int *, int, int, int, in… 1.9 1409326000 5310 265409.8 356352.0 16320 367328 151370.8 ampere_bf16_s16816gemm_bf16_64x64_sliced1x2_ldg8_f2f_stages_64x5_tn 1.9 1382453159 185760 7442.1 7392.0 6240 13472 369.3 void at::native::reduce_kernel<(int)128, (int)4, at::native::ReduceOp(int)0&&vllm::_typeConvert::exists, void>::type vllm::fused_add_rms_norm_kern… 1.8 1360460198 189630 7174.3 7008.0 5504 17856 718.6 void at::native::unrolled_elementwise_kernel(T1 *, const T1 *, const T1 *, float, int, int) 1.4 1018883192 185760 5484.9 5472.0 5024 8000 61.5 void vllm::moe::topkGatingSoftmax<(int)4, (int)128, (int)4, (int)16, int>(const float *, const bool… 1.2 860705869 561117 1533.9 1376.0 1247 2688 261.6 void at::native::vectorized_elementwise_kernel<(int)4, at::native::FillFunctor, std::array(const long *, T1 *, T1 *, const T1 *, in… 1.0 756608821 185760 4073.0 4064.0 3840 6048 104.7 void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp, (bool)1>(T1 *, cons… 0.7 537884109 185760 2895.6 2880.0 2752 3968 45.8 void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, (bool)0, __n… 0.7 494377950 185760 2661.4 2656.0 2592 4672 104.9 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0… 0.6 460223727 182880 2516.5 2592.0 2208 3264 131.7 void at::native::elementwise_kernel<(int)128, (int)2, void at::native::gpu_kernel_impl_nocast, std:… 0.4 328896261 185760 1770.5 1760.0 1568 3584 131.0 void vllm::moe::count_and_sort_expert_tokens_kernel(const T1 *, int *, int *, unsigned long) 0.4 263914824 46080 5727.3 5696.0 5440 8320 81.1 void flash::flash_fwd_splitkv_combine_kernel::cunn_SoftMaxForward<(int)4, float, float, float, at::native:::… 0.1 79960104 2880 27763.9 30464.0 23712 38752 3833.5 ampere_bf16_s16816gemm_bf16_128x128_ldg8_f2f_stages_64x3_tn 0.1 51639494 3870 13343.5 13344.0 10240 16704 288.1 void at::native::reduce_kernel<(int)512, (int)1, 
at::native::ReduceOp::distribution_elementwise_grid_stride_kernel::indexSelectSmallIndex(T1::Param… 0.0 12874940 2880 4470.5 4512.0 3199 7136 1228.3 void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, (boo… 0.0 8327452 1440 5783.0 5760.0 5536 7200 218.4 void cutlass::Kernel2(T1::Params) 0.0 8267196 3870 2136.2 2144.0 1920 2912 63.2 void at::native::unrolled_elementwise_kernel, std::array, std::array, std::array::indexSelectLargeIndex, std::array, std::array(int)0&&vllm::_typeConvert::exists, void>::type vllm::fused_add_rms_norm_kernel(T1 *, T1 *, const T1 *, float, int, int) 7: void at::native::::cunn_SoftMaxForward<(int)4, float, float, float, at::native::::SoftMaxForwardEpilogue>(T4 *, const T2 *, int) 8: void at::native::::distribution_elementwise_grid_stride_kernel(at::TensorIteratorBase &, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T3, T4)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::::distribution_nullary_kernel(at::TensorIteratorBase &, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T3, T4)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::templates::cuda::exponential_kernel(at::TensorIteratorBase &, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 2)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T4, const T5 &, T6)::[lambda(int, float) (instance 1)]>(long, at::PhiloxCudaState, T3, T4) 9: void at::native::::indexSelectLargeIndex(at::cuda::detail::TensorInfo, at::cuda::detail::TensorInfo, at::cuda::detail::TensorInfo, int, int, T3, T3, long) 10: void at::native::::indexSelectSmallIndex(at::cuda::detail::TensorInfo, at::cuda::detail::TensorInfo, at::cuda::detail::TensorInfo, int, int, 
T3, long) 11: void at::native::elementwise_kernel<(int)128, (int)2, void at::native::gpu_kernel_impl_nocast>>(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3) 12: void at::native::elementwise_kernel<(int)128, (int)2, void at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3) 13: void at::native::elementwise_kernel<(int)128, (int)4, void at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase &, const T1 &)::[lambda(int) (instance 1)]>(int, T3) 14: void at::native::index_elementwise_kernel<(int)128, (int)4, void at::native::gpu_index_kernel>(at::TensorIteratorBase &, c10::ArrayRef, c10::ArrayRef)::[lambda(char *, const char *, long) (instance 1)]>(at::TensorIteratorBase &, c10::ArrayRef, c10::ArrayRef, const T1 &)::[lambda(int) (instance 1)]>(long, T3) 15: void at::native::reduce_kernel<(int)128, (int)4, at::native::ReduceOp::operator ()(at::TensorIterator &)::[lambda(float, float) (instance 1)]>, unsigned int, c10::BFloat16, (int)4, (int)4>>(T3) 16: void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp, unsigned int, long, (int)4, (int)4>>(T3) 17: void at::native::reduce_kernel<(int)512, (int)1, at::native::ReduceOp::operator ()(at::TensorIterator &)::[lambda(float, float) (instance 1)]>, unsigned int, float, (int)4, (int)4>>(T3) 18: void at::native::unrolled_elementwise_kernel, std::array, (int)8, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, T1, T2, T4, T5, T6, T7) 19: void at::native::unrolled_elementwise_kernel, std::array, (int)8, TrivialOffsetCalculator<(int)0, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, T1, T2, T4, T5, T6, T7) 20: void at::native::unrolled_elementwise_kernel, std::array, (int)8, TrivialOffsetCalculator<(int)0, unsigned 
int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithoutCast, at::native::memory::StoreWithoutCast>(int, T1, T2, T4, T5, T6, T7) 21: void at::native::unrolled_elementwise_kernel, (int)8, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)1>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T4, T5, T6, T7) 22: void at::native::unrolled_elementwise_kernel, (int)8, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)1>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T4, T5, T6, T7) 23: void at::native::unrolled_elementwise_kernel, (int)8, TrivialOffsetCalculator<(int)1, unsigned int>, TrivialOffsetCalculator<(int)1, unsigned int>, at::native::memory::LoadWithCast<(int)1>, at::native::memory::StoreWithCast<(int)1>>(int, T1, T2, T4, T5, T6, T7) 24: void at::native::vectorized_elementwise_kernel<(int)2, at::native::FillFunctor, std::array>(int, T2, T3) 25: void at::native::vectorized_elementwise_kernel<(int)2, at::native::FillFunctor, std::array>(int, T2, T3) 26: void at::native::vectorized_elementwise_kernel<(int)4, at::native::BinaryFunctor>, std::array>(int, T2, T3) 27: void at::native::vectorized_elementwise_kernel<(int)4, at::native::FillFunctor, std::array>(int, T2, T3) 28: void at::native::vectorized_elementwise_kernel<(int)8, at::native::FillFunctor, std::array>(int, T2, T3) 29: void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, __nv_bfloat16, __nv_bfloat16, float, (bool)0, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16, (bool)1, (bool)0, (bool)0>(cublasLt::cublasSplitKParams, const T4 *, const T9 *, T8 *, T5 *, const T6 *, const T6 *, const T10 *, const T4 *, T10 *, void *, long, T6 *, int *, T6 *, const T6 *, const T6 *, const T6 *, const T6 *) 30: void cublasLt::splitKreduce_kernel<(int)32, (int)16, int, float, __nv_bfloat16, float, (bool)0, 
__nv_bfloat16, __nv_bfloat16, __nv_bfloat16, (bool)1, (bool)0, (bool)0>(cublasLt::cublasSplitKParams, const T4 *, const T9 *, T8 *, T5 *, const T6 *, const T6 *, const T10 *, const T4 *, T10 *, void *, long, T6 *, int *, T6 *, const T6 *, const T6 *, const T6 *, const T6 *) 31: void cutlass::Kernel2(T1::Params) 32: void cutlass::Kernel2(T1::Params) 33: void cutlass::Kernel2(T1::Params) 34: void cutlass::Kernel2(T1::Params) 35: void flash::flash_fwd_splitkv_combine_kernel>, (int)4, (int)1, (bool)1>(flash::Flash_fwd_params) 36: void flash::flash_fwd_splitkv_kernel>, (bool)0, (bool)0, (bool)0, (bool)0, (bool)1, (bool)0, (bool)0, (bool)0>(flash::Flash_fwd_params) 37: void flash::flash_fwd_splitkv_kernel>, (bool)0, (bool)0, (bool)0, (bool)0, (bool)1, (bool)0, (bool)1, (bool)0>(flash::Flash_fwd_params) 38: void flash::flash_fwd_splitkv_kernel>, (bool)1, (bool)0, (bool)0, (bool)0, (bool)1, (bool)0, (bool)0, (bool)0>(flash::Flash_fwd_params) 39: void vllm::act_and_mul_kernel, (bool)1>(T1 *, const T1 *, int) 40: void vllm::moe::count_and_sort_expert_tokens_kernel(const T1 *, int *, int *, unsigned long) 41: void vllm::moe::moe_align_block_size_kernel(const T1 *, int *, int *, int *, int, int, int, int, unsigned long, int *) 42: void vllm::moe::topkGatingSoftmax<(int)4, (int)128, (int)4, (int)16, int>(const float *, const bool *, float *, int, T5 *, int *, int, int, int) 43: void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0>(const T1 *, const T1 *, T2 *, T2 *, const long *, long, long, long, long, long, int, int, int, const float *, const float *) 44: void vllm::rms_norm_kernel(T1 *, const T1 *, const T1 *, float, int, int) 45: void vllm::rotary_embedding_kernel(const long *, T1 *, T1 *, const T1 *, int, long, long, long, int, int, int) ```