Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network); task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
95 lines
3.2 KiB
Bash
Executable File
95 lines
3.2 KiB
Bash
Executable File
#!/usr/bin/env bash
# Build the llama.cpp baseline (third_party/llama.cpp) with CUDA.
#
# Source is vendored as a git submodule pinned to a fixed tag (see .gitmodules
# and the recorded gitlink commit). This script does NOT fetch from the network
# by default — it expects the source to already be present, either via:
#   - `git submodule update --init` (on a host with network), or
#   - rsync/tar transfer (how it reaches dash5, which has no network).
#
# It only fetches as a convenience fallback when the source is missing AND
# network is reachable.
#
# Idempotent. Safe to re-run.
#
# Usage:
#   tools/setup-llama-cpp.sh             # build (configure if needed)
#   tools/setup-llama-cpp.sh --rebuild   # wipe build dir, reconfigure, rebuild
#   tools/setup-llama-cpp.sh --help      # print this header and exit (also -h)
#
# Env:
#   CUDA_ARCH   CUDA architectures for cmake (default 120-real = RTX 5090 SM120)
#   CUDA_HOME   CUDA toolkit root (auto-detected: /usr/local/cuda-12.9 then cuda)

set -euo pipefail

# Repository root is the parent of the directory holding this script.
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
VENDOR_DIR="$ROOT_DIR/third_party/llama.cpp"
CUDA_ARCH="${CUDA_ARCH:-120-real}"
REBUILD=0

# Print the header comment block of this script as help text.
# Skips the shebang and stops at the first non-comment line, so section
# comments further down the file are not mixed into the help output.
usage() {
  awk '
    NR == 1 && /^#!/ { next }
    !/^#/            { exit }
    { sub(/^# ?/, ""); print }
  ' "$0"
}

# Parse command-line flags.
# Globals:   REBUILD (written: set to 1 by --rebuild)
# Arguments: the script's positional parameters
# Outputs:   help text on stdout for --help/-h (then exits 0);
#            a warning on stderr for each unrecognised argument
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --rebuild) REBUILD=1 ;;
      --help | -h)
        usage
        exit 0
        ;;
      # Unknown arguments stay non-fatal (as before), but are no longer
      # silent: a mistyped --rebuild would otherwise quietly skip the rebuild.
      *) echo "WARNING: ignoring unknown argument: $arg" >&2 ;;
    esac
  done
}

parse_args "$@"

# --- Locate the CUDA toolkit ---
# Choose CUDA_HOME and put its bin/ directory first on PATH.
# A non-empty CUDA_HOME supplied by the caller always wins; otherwise the first
# existing directory among the default roots is used. When nothing is found the
# environment is left untouched.
# Globals: CUDA_HOME (read/written, exported), PATH (written, exported)
setup_cuda_env() {
  local candidate
  if [ -z "${CUDA_HOME:-}" ]; then
    for candidate in /usr/local/cuda-12.9 /usr/local/cuda; do
      if [ -d "$candidate" ]; then
        CUDA_HOME="$candidate"
        break
      fi
    done
  fi
  # Explicit `if` instead of `[ ... ] && export ...`, so the step never leaves
  # a non-zero status behind when no toolkit was found.
  if [ -n "${CUDA_HOME:-}" ]; then
    export CUDA_HOME
    export PATH="$CUDA_HOME/bin:$PATH"
  fi
}

setup_cuda_env

# Banner summarising the effective build configuration.
echo "=== llama.cpp build ==="
echo " vendor dir : $VENDOR_DIR"
echo " CUDA arch : $CUDA_ARCH"
echo " CUDA_HOME : ${CUDA_HOME:-<not set>}"

# --- Ensure source is present ---
# The top-level CMakeLists.txt is used as the marker that the vendored tree
# exists. Fetching is only attempted when this checkout is a git work tree AND
# the upstream remote answers a quick probe.
if [ ! -f "$VENDOR_DIR/CMakeLists.txt" ]; then
  echo "==> source missing at $VENDOR_DIR"

  # Bound the probe to 8 seconds when `timeout` is available. Without this
  # check a missing `timeout` binary would be reported as "network
  # unavailable"; in that case the probe runs unbounded instead.
  net_probe=(git ls-remote https://github.com/ggerganov/llama.cpp HEAD)
  if command -v timeout >/dev/null 2>&1; then
    net_probe=(timeout 8 "${net_probe[@]}")
  fi

  if git -C "$ROOT_DIR" rev-parse --git-dir >/dev/null 2>&1 \
    && "${net_probe[@]}" >/dev/null 2>&1; then
    echo "==> network OK, initializing submodule"
    git -C "$ROOT_DIR" submodule update --init --recursive third_party/llama.cpp
    # Fail here with a clear message rather than later inside cmake if the
    # submodule update did not actually populate the tree.
    if [ ! -f "$VENDOR_DIR/CMakeLists.txt" ]; then
      echo "ERROR: submodule init finished but $VENDOR_DIR/CMakeLists.txt is still missing." >&2
      exit 1
    fi
  else
    echo "ERROR: llama.cpp source not present and network unavailable." >&2
    echo " On a networked host run: git submodule update --init third_party/llama.cpp" >&2
    echo " Then transfer the source here (the bench tooling does this via tar-over-ssh)." >&2
    exit 1
  fi
fi

# --- Build directory handling ---
BUILD_DIR="$VENDOR_DIR/build"
if [ "$REBUILD" -eq 1 ] && [ -d "$BUILD_DIR" ]; then
  echo "==> --rebuild: removing $BUILD_DIR"
  # `--` stops option parsing and `:?` aborts on an empty path, so this can
  # never expand to a bare `rm -rf`.
  rm -rf -- "${BUILD_DIR:?}"
fi

SERVER_BIN="$BUILD_DIR/bin/llama-server"
CLI_BIN="$BUILD_DIR/bin/llama-cli"
# Skip the build only when BOTH build targets are already present; checking
# llama-server alone would report "already built" after an interrupted build
# that never produced llama-cli.
if [ -x "$SERVER_BIN" ] && [ -x "$CLI_BIN" ] && [ "$REBUILD" -eq 0 ]; then
  echo "==> already built: $SERVER_BIN (use --rebuild to force)"
  # The server path is printed on its own as the final stdout line.
  echo "$SERVER_BIN"
  exit 0
fi

# --- Configure ---
# CUDA backend on; curl support, tests and examples off; Release build for the
# requested CUDA architectures.
echo "==> cmake configure"
cmake_args=(
  -S "$VENDOR_DIR"
  -B "$BUILD_DIR"
  -DGGML_CUDA=ON
  -DLLAMA_CURL=OFF
  -DLLAMA_BUILD_TESTS=OFF
  -DLLAMA_BUILD_EXAMPLES=OFF
  -DCMAKE_BUILD_TYPE=Release
  -DCMAKE_CUDA_ARCHITECTURES="$CUDA_ARCH"
)
cmake "${cmake_args[@]}"

# --- Build ---
# Work out the job count once. Fall back when `nproc` is unavailable so cmake
# is never handed an empty `-j` value.
build_jobs="$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)"
echo "==> build llama-server llama-cli (jobs: $build_jobs)"
cmake --build "$BUILD_DIR" --target llama-server llama-cli -j "$build_jobs"

# Sanity check: the build must have produced an executable server binary.
if [ ! -x "$SERVER_BIN" ]; then
  echo "ERROR: llama-server did not build at $SERVER_BIN" >&2
  exit 1
fi

echo "=== done ==="
# The server path is printed on its own as the final stdout line.
echo "$SERVER_BIN"