Files
xserv/tools/test_fp8_gemm.cu
Gahow Wang 3a530956af tools: add FP8 vs BF16 benchmark and GSM8K eval harness
bench_fp8.py — head-to-head comparison of FP8 and BF16 models on
  GSM8K / AIME2025 accuracy plus TTFT/TPOT performance measurement.

eval_gsm8k_batch.sh — lightweight GSM8K accuracy evaluator that
  pipes one problem per xserv-chat invocation and scores with
  \boxed{} / last-number extraction.

Benchmark results (gpt-oss-20b, 50-problem GSM8K):
  FP8 W8A8 TP1 : 94.0%  (single RTX 5090, 25 GB)
  FP8 W8A16 TP1: 94.0%
  BF16 TP2     : 94.0%  (requires 2× RTX 5090)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-06-08 15:43:04 +08:00

101 lines
4.3 KiB
Plaintext

#include <cuda_runtime.h>
#include <cuda_fp8.h>
#include <cublasLt.h>
#include <stdio.h>
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) return true;
    fprintf(stderr, "CUDA error: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Reports a failed cuBLASLt call on stderr.
// Returns true when `st` is CUBLAS_STATUS_SUCCESS, false otherwise.
static bool cublasOk(cublasStatus_t st, const char *what) {
    if (st == CUBLAS_STATUS_SUCCESS) return true;
    fprintf(stderr, "cuBLASLt error: %s failed with status %d\n", what, (int)st);
    return false;
}

// Asks cuBLASLt whether it has an algorithm for an FP8 (E4M3) x FP8 -> BF16
// matmul of logical shape (m x k) * (k x n) with the given transpose flags,
// and prints the heuristic status and the number of algorithms found.
//
// Operand layouts are column-major and derived from the transpose flags:
//   A: opA == N -> stored (m, k), ld = m;  opA == T -> stored (k, m), ld = k
//   B: opB == N -> stored (k, n), ld = k;  opB == T -> stored (n, k), ld = n
//   C/D: stored (m, n), ld = m
//
// Parameters:
//   handle, pref - cuBLASLt handle and search preference owned by the caller
//   dScale       - device pointer to a float used as both the A and B scale
//   testIndex    - number printed in the "TestN:" banner
//   opA, opB     - transpose operation for A and B
//   m, n, k      - logical GEMM dimensions
//
// Returns false only when descriptor setup fails. The heuristic's own status
// is printed rather than treated as an error, because a "not supported"
// answer is exactly what this probe is meant to reveal.
static bool probeHeuristic(cublasLtHandle_t handle, cublasLtMatmulPreference_t pref,
                           void *dScale, int testIndex,
                           cublasOperation_t opA, cublasOperation_t opB,
                           int m, int n, int k) {
    printf("Test%d: transA=%c transB=%c, m=%d n=%d k=%d\n", testIndex,
           opA == CUBLAS_OP_T ? 'T' : 'N', opB == CUBLAS_OP_T ? 'T' : 'N', m, n, k);

    const bool aTransposed = (opA == CUBLAS_OP_T);
    const bool bTransposed = (opB == CUBLAS_OP_T);
    const int aRows = aTransposed ? k : m;
    const int aCols = aTransposed ? m : k;
    const int bRows = bTransposed ? n : k;
    const int bCols = bTransposed ? k : n;

    cublasLtMatmulDesc_t desc = NULL;
    cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL;

    // Short-circuit chain: the first failing call is reported and the rest
    // are skipped; anything already created is released below.
    bool ok =
        cublasOk(cublasLtMatmulDescCreate(&desc, CUBLAS_COMPUTE_32F, CUDA_R_32F),
                 "cublasLtMatmulDescCreate") &&
        cublasOk(cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_TRANSA,
                                                &opA, sizeof(opA)),
                 "set TRANSA") &&
        cublasOk(cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_TRANSB,
                                                &opB, sizeof(opB)),
                 "set TRANSB") &&
        // The scale attributes take the device pointer by value, hence &dScale.
        cublasOk(cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
                                                &dScale, sizeof(dScale)),
                 "set A_SCALE_POINTER") &&
        cublasOk(cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
                                                &dScale, sizeof(dScale)),
                 "set B_SCALE_POINTER") &&
        cublasOk(cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_8F_E4M3, aRows, aCols, aRows),
                 "cublasLtMatrixLayoutCreate(A)") &&
        cublasOk(cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_8F_E4M3, bRows, bCols, bRows),
                 "cublasLtMatrixLayoutCreate(B)") &&
        cublasOk(cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_16BF, m, n, m),
                 "cublasLtMatrixLayoutCreate(C)") &&
        cublasOk(cublasLtMatrixLayoutCreate(&Ddesc, CUDA_R_16BF, m, n, m),
                 "cublasLtMatrixLayoutCreate(D)");

    if (ok) {
        cublasLtMatmulHeuristicResult_t result;
        int found = 0;
        cublasStatus_t status = cublasLtMatmulAlgoGetHeuristic(
            handle, desc, Adesc, Bdesc, Cdesc, Ddesc, pref, 1, &result, &found);
        printf(" status=%d found=%d\n", (int)status, found);
    }

    if (Adesc) cublasLtMatrixLayoutDestroy(Adesc);
    if (Bdesc) cublasLtMatrixLayoutDestroy(Bdesc);
    if (Cdesc) cublasLtMatrixLayoutDestroy(Cdesc);
    if (Ddesc) cublasLtMatrixLayoutDestroy(Ddesc);
    if (desc) cublasLtMatmulDescDestroy(desc);
    return ok;
}

// Probes which transpose/layout combinations cuBLASLt accepts for an FP8
// decode-shaped GEMM and prints the heuristic result for each combination.
// Exits with 0 when every probe could be set up, 1 if any CUDA/cuBLASLt
// setup call failed.
int main() {
    // Model dimensions: M=1 (decode), K=2880, N=5760
    const int M = 1, N = 5760, K = 2880;

    cublasLtHandle_t handle = NULL;
    cublasLtMatmulPreference_t pref = NULL;
    void *dScale = NULL;
    const float one = 1.0f;
    size_t ws = 32 * 1024 * 1024;  // maximum workspace the heuristic may assume

    bool ok =
        cublasOk(cublasLtCreate(&handle), "cublasLtCreate") &&
        cudaOk(cudaMalloc(&dScale, sizeof(one)), "cudaMalloc(dScale)") &&
        cudaOk(cudaMemcpy(dScale, &one, sizeof(one), cudaMemcpyHostToDevice),
               "cudaMemcpy(dScale)") &&
        cublasOk(cublasLtMatmulPreferenceCreate(&pref), "cublasLtMatmulPreferenceCreate") &&
        cublasOk(cublasLtMatmulPreferenceSetAttribute(
                     pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &ws, sizeof(ws)),
                 "set MAX_WORKSPACE_BYTES");

    if (ok) {
        // The GEMM is issued as m=N, n=M, k=K in every test; only the
        // transpose flags (and therefore the stored layouts) vary.
        struct { cublasOperation_t opA, opB; } const cases[] = {
            {CUBLAS_OP_T, CUBLAS_OP_N},  // Test 1: A stored (K, N) ld=K, B stored (K, M) ld=K
            {CUBLAS_OP_N, CUBLAS_OP_N},  // Test 2: A stored (N, K) ld=N, B stored (K, M) ld=K
            {CUBLAS_OP_N, CUBLAS_OP_T},  // Test 3: A stored (N, K) ld=N, B stored (M, K) ld=M
        };
        const int numCases = (int)(sizeof(cases) / sizeof(cases[0]));
        for (int i = 0; i < numCases; ++i) {
            // Keep probing the remaining layouts even if one fails to set up.
            if (!probeHeuristic(handle, pref, dScale, i + 1,
                                cases[i].opA, cases[i].opB, N, M, K)) {
                ok = false;
            }
        }
    }

    if (pref) cublasLtMatmulPreferenceDestroy(pref);
    if (handle) cublasLtDestroy(handle);
    if (dScale) cudaFree(dScale);

    if (!ok) return 1;
    printf("Done.\n");
    return 0;
}