bench_fp8.py — head-to-head comparison of FP8 and BF16 models on
GSM8K / AIME2025 accuracy plus TTFT/TPOT performance measurement.
eval_gsm8k_batch.sh — lightweight GSM8K accuracy evaluator that
pipes one problem per xserv-chat invocation and scores with
\boxed{} / last-number extraction.
Benchmark results (gpt-oss-20b, 50-problem GSM8K):
FP8 W8A8 TP1 : 94.0% (single RTX 5090, 25 GB)
FP8 W8A16 TP1: 94.0%
BF16 TP2 : 94.0% (requires 2× RTX 5090)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
101 lines
4.3 KiB
Plaintext
101 lines
4.3 KiB
Plaintext
#include <cuda_runtime.h>
|
|
#include <cuda_fp8.h>
|
|
#include <cublasLt.h>
|
|
#include <stdio.h>
|
|
|
|
int main() {
|
|
cublasLtHandle_t handle;
|
|
cublasLtCreate(&handle);
|
|
|
|
// Model dimensions: M=1 (decode), K=2880, N=5760
|
|
int M=1, N=5760, K=2880;
|
|
|
|
float one = 1.0f;
|
|
void *dScale;
|
|
cudaMalloc(&dScale, 4);
|
|
cudaMemcpy(dScale, &one, 4, cudaMemcpyHostToDevice);
|
|
|
|
cublasLtMatmulPreference_t pref;
|
|
cublasLtMatmulPreferenceCreate(&pref);
|
|
size_t ws = 32*1024*1024;
|
|
cublasLtMatmulPreferenceSetAttribute(pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &ws, sizeof(ws));
|
|
|
|
cublasLtMatmulDesc_t desc;
|
|
cublasLtMatrixLayout_t Adesc, Bdesc, Cdesc, Ddesc;
|
|
cublasLtMatmulHeuristicResult_t result;
|
|
int found;
|
|
cublasStatus_t status;
|
|
|
|
// Test 1: transA=T, transB=N, m=N, n=M, k=K
|
|
// A stored (K, N) ld=K -> transposed to (N, K)
|
|
// B stored (K, M) ld=K
|
|
printf("Test1: transA=T transB=N, m=%d n=%d k=%d\n", N, M, K);
|
|
{
|
|
cublasLtMatmulDescCreate(&desc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
|
|
int32_t transA = 1; // CUBLAS_OP_T
|
|
cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, 4);
|
|
cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &dScale, sizeof(void*));
|
|
cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &dScale, sizeof(void*));
|
|
|
|
cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_8F_E4M3, K, N, K);
|
|
cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_8F_E4M3, K, M, K);
|
|
cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_16BF, N, M, N);
|
|
cublasLtMatrixLayoutCreate(&Ddesc, CUDA_R_16BF, N, M, N);
|
|
|
|
found = 0;
|
|
status = cublasLtMatmulAlgoGetHeuristic(handle, desc, Adesc, Bdesc, Cdesc, Ddesc, pref, 1, &result, &found);
|
|
printf(" status=%d found=%d\n", status, found);
|
|
cublasLtMatrixLayoutDestroy(Adesc); cublasLtMatrixLayoutDestroy(Bdesc);
|
|
cublasLtMatrixLayoutDestroy(Cdesc); cublasLtMatrixLayoutDestroy(Ddesc);
|
|
cublasLtMatmulDescDestroy(desc);
|
|
}
|
|
|
|
// Test 2: same but transA=N, transB=N
|
|
printf("Test2: transA=N transB=N, m=%d n=%d k=%d\n", N, M, K);
|
|
{
|
|
cublasLtMatmulDescCreate(&desc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
|
|
cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &dScale, sizeof(void*));
|
|
cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &dScale, sizeof(void*));
|
|
|
|
cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_8F_E4M3, N, K, N);
|
|
cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_8F_E4M3, K, M, K);
|
|
cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_16BF, N, M, N);
|
|
cublasLtMatrixLayoutCreate(&Ddesc, CUDA_R_16BF, N, M, N);
|
|
|
|
found = 0;
|
|
status = cublasLtMatmulAlgoGetHeuristic(handle, desc, Adesc, Bdesc, Cdesc, Ddesc, pref, 1, &result, &found);
|
|
printf(" status=%d found=%d\n", status, found);
|
|
cublasLtMatrixLayoutDestroy(Adesc); cublasLtMatrixLayoutDestroy(Bdesc);
|
|
cublasLtMatrixLayoutDestroy(Cdesc); cublasLtMatrixLayoutDestroy(Ddesc);
|
|
cublasLtMatmulDescDestroy(desc);
|
|
}
|
|
|
|
// Test 3: transA=N, transB=T
|
|
printf("Test3: transA=N transB=T, m=%d n=%d k=%d\n", N, M, K);
|
|
{
|
|
cublasLtMatmulDescCreate(&desc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
|
|
int32_t transB = 1;
|
|
cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, 4);
|
|
cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &dScale, sizeof(void*));
|
|
cublasLtMatmulDescSetAttribute(desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &dScale, sizeof(void*));
|
|
|
|
cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_8F_E4M3, N, K, N);
|
|
cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_8F_E4M3, M, K, M);
|
|
cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_16BF, N, M, N);
|
|
cublasLtMatrixLayoutCreate(&Ddesc, CUDA_R_16BF, N, M, N);
|
|
|
|
found = 0;
|
|
status = cublasLtMatmulAlgoGetHeuristic(handle, desc, Adesc, Bdesc, Cdesc, Ddesc, pref, 1, &result, &found);
|
|
printf(" status=%d found=%d\n", status, found);
|
|
cublasLtMatrixLayoutDestroy(Adesc); cublasLtMatrixLayoutDestroy(Bdesc);
|
|
cublasLtMatrixLayoutDestroy(Cdesc); cublasLtMatrixLayoutDestroy(Ddesc);
|
|
cublasLtMatmulDescDestroy(desc);
|
|
}
|
|
|
|
cublasLtMatmulPreferenceDestroy(pref);
|
|
cublasLtDestroy(handle);
|
|
cudaFree(dScale);
|
|
printf("Done.\n");
|
|
return 0;
|
|
}
|