Files
xserv/tools/setup-llama-cpp.sh
Gahow Wang 49c7653222 tools: add llama.cpp comparison baseline + standard benchmark suite
Vendor llama.cpp as a submodule pinned to b9371 and add a one-click
benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh
  converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput
  (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network);
  task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the
  GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:18:52 +08:00

95 lines
3.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# Build the llama.cpp baseline (third_party/llama.cpp) with CUDA.
#
# Source is vendored as a git submodule pinned to a fixed tag (see .gitmodules
# and the recorded gitlink commit). This script does NOT fetch from the network
# by default — it expects the source to already be present, either via:
# - `git submodule update --init` (on a host with network), or
# - rsync/tar transfer (how it reaches dash5, which has no network).
#
# It only fetches as a convenience fallback when the source is missing AND
# network is reachable.
#
# Idempotent. Safe to re-run.
#
# Usage:
# tools/setup-llama-cpp.sh # build (configure if needed)
# tools/setup-llama-cpp.sh --rebuild # wipe build dir, reconfigure, rebuild
#
# Env:
# CUDA_ARCH CUDA architectures for cmake (default 120-real = RTX 5090 SM120)
# CUDA_HOME CUDA toolkit root (if unset, auto-detected: /usr/local/cuda-12.9, then /usr/local/cuda)
set -euo pipefail

# Repository root is the parent of the directory that holds this script.
script_dir="$(dirname "$0")"
ROOT_DIR="$(cd "${script_dir}/.." && pwd)"

# Location of the vendored llama.cpp checkout inside the repository.
VENDOR_DIR="${ROOT_DIR}/third_party/llama.cpp"

# A caller-supplied CUDA_ARCH wins; otherwise fall back to 120-real.
: "${CUDA_ARCH:=120-real}"
# 1 when --rebuild was requested, 0 otherwise.
REBUILD=0

#######################################
# Print the leading comment header of a script as help text.
# Skips a shebang on line 1, strips the leading "#" (plus one optional space)
# from each header line, and stops at the first line that is not a comment,
# so comments further down the script never leak into the help output.
# Arguments: $1 - path of the script whose header should be printed
# Outputs:   header text on stdout
#######################################
print_header_help() {
  sed -n -e '1{/^#!/d;}' -e '/^#/!q' -e 's/^# \{0,1\}//p' -- "$1"
}

for arg in "$@"; do
  case "$arg" in
    --rebuild)
      REBUILD=1
      ;;
    --help | -h)
      print_header_help "$0"
      exit 0
      ;;
    *)
      # Stay lenient (do not abort), but make typos such as a misspelled
      # --rebuild visible. stderr keeps stdout clean for the binary path.
      echo "WARNING: ignoring unknown argument: $arg" >&2
      ;;
  esac
done
# Resolve the CUDA toolkit root: the first existing candidate directory
# triggers the export, but a CUDA_HOME already provided by the caller is kept.
for cuda_root in /usr/local/cuda-12.9 /usr/local/cuda; do
  if [[ -d "$cuda_root" ]]; then
    export CUDA_HOME="${CUDA_HOME:-$cuda_root}"
    break
  fi
done

# Put the toolkit's bin directory first on PATH when a root is known.
if [[ -n "${CUDA_HOME:-}" ]]; then
  export PATH="$CUDA_HOME/bin:$PATH"
fi

# Banner summarising the effective build settings.
printf '%s\n' \
  "=== llama.cpp build ===" \
  " vendor dir : $VENDOR_DIR" \
  " CUDA arch : $CUDA_ARCH" \
  " CUDA_HOME : ${CUDA_HOME:-<not set>}"
# --- Ensure source is present ---
# The top-level CMakeLists.txt is used as the marker that the vendored tree
# exists. Fetching is only a fallback for when the marker is missing.
if [[ ! -f "$VENDOR_DIR/CMakeLists.txt" ]]; then
  echo "==> source missing at $VENDOR_DIR"
  # Fetch only if ROOT_DIR is a git checkout AND the remote answers within
  # 8 seconds, so an offline host fails fast instead of hanging.
  if git -C "$ROOT_DIR" rev-parse --git-dir >/dev/null 2>&1 \
    && timeout 8 git ls-remote https://github.com/ggerganov/llama.cpp HEAD >/dev/null 2>&1; then
    echo "==> network OK, initializing submodule"
    git -C "$ROOT_DIR" submodule update --init --recursive third_party/llama.cpp
    # A successful git exit status does not prove the tree is usable;
    # re-check the marker so a bad checkout fails here, not inside cmake.
    if [[ ! -f "$VENDOR_DIR/CMakeLists.txt" ]]; then
      echo "ERROR: submodule init finished but $VENDOR_DIR/CMakeLists.txt is still missing." >&2
      exit 1
    fi
  else
    echo "ERROR: llama.cpp source not present and network unavailable." >&2
    echo " On a networked host run: git submodule update --init third_party/llama.cpp" >&2
    echo " Then transfer the source here (the bench tooling does this via tar-over-ssh)." >&2
    exit 1
  fi
fi
# Build tree lives inside the vendored checkout; the server binary is the
# artifact whose presence marks a finished build.
BUILD_DIR="$VENDOR_DIR/build"
SERVER_BIN="$BUILD_DIR/bin/llama-server"

# --rebuild: throw away any previous build tree so cmake starts from scratch.
if [[ "$REBUILD" -eq 1 && -d "$BUILD_DIR" ]]; then
  echo "==> --rebuild: removing $BUILD_DIR"
  rm -rf "$BUILD_DIR"
fi

# Nothing to do when the binary already exists and no rebuild was requested.
# The binary path is printed as the last line, as on a fresh build.
if [[ -x "$SERVER_BIN" && "$REBUILD" -eq 0 ]]; then
  echo "==> already built: $SERVER_BIN (use --rebuild to force)"
  echo "$SERVER_BIN"
  exit 0
fi
# Options handed to the cmake configure step; kept in an array so each flag
# stays a separate, safely quoted word.
configure_flags=(
  -DGGML_CUDA=ON
  -DLLAMA_CURL=OFF
  -DLLAMA_BUILD_TESTS=OFF
  -DLLAMA_BUILD_EXAMPLES=OFF
  -DCMAKE_BUILD_TYPE=Release
  "-DCMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH"
)
# Only these two targets are built.
build_targets=(llama-server llama-cli)

echo "==> cmake configure"
cmake -S "$VENDOR_DIR" -B "$BUILD_DIR" "${configure_flags[@]}"

# Parallelism follows the processor count reported by nproc.
echo "==> build llama-server llama-cli (jobs: $(nproc))"
cmake --build "$BUILD_DIR" --target "${build_targets[@]}" -j "$(nproc)"
# Guard: the build must have produced an executable server binary.
[[ -x "$SERVER_BIN" ]] || {
  echo "ERROR: llama-server did not build at $SERVER_BIN" >&2
  exit 1
}

# The final stdout line is the path to the built binary.
echo "=== done ==="
echo "$SERVER_BIN"