xserv/tools/convert-to-gguf.sh

#!/usr/bin/env bash
# Convert a HuggingFace safetensors model dir into a BF16 GGUF for llama.cpp.
#
# Why BF16: we run xserv in BF16, so the baseline must run BF16 too. If we
# compared xserv-BF16 against llama.cpp-Q4_K_M the speed delta would be
# dominated by quantization, not by our kernels — that's not an apples-to-
# apples comparison.
#
# Usage:
#   tools/convert-to-gguf.sh <hf-model-dir> [out.gguf]
#
# Example:
#   tools/convert-to-gguf.sh /opt/wjh/models/qwen3-8b
#   # → /opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf

set -euo pipefail

# Require at least the source model directory; the output path is optional.
if [ "$#" -lt 1 ]; then
    echo "Usage: $0 <hf-model-dir> [out.gguf]" >&2
    exit 1
fi

# Resolve the source dir to an absolute path. If realpath itself fails (e.g. a
# missing parent directory) fall back to the raw argument so that the -d check
# below reports the problem instead of `set -e` aborting on realpath's error.
SRC="$(realpath "$1" 2>/dev/null)" || SRC="$1"

# Repo root is the parent of the directory holding this script. CDPATH is
# cleared for the cd because, when set, cd may print the directory it resolved
# to, which would end up inside the captured value.
ROOT_DIR="$(CDPATH='' cd -- "$(dirname -- "$0")/.." >/dev/null && pwd)"
CONVERT_PY="$ROOT_DIR/third_party/llama.cpp/convert_hf_to_gguf.py"

if [ ! -f "$CONVERT_PY" ]; then
    echo "convert script not found: $CONVERT_PY" >&2
    echo "Run tools/setup-llama-cpp.sh first." >&2
    exit 1
fi

if [ ! -d "$SRC" ]; then
    echo "source model dir not found: $SRC" >&2
    exit 1
fi

# Output path: explicit second argument, else <src>/<src-basename>-bf16.gguf.
if [ "$#" -ge 2 ]; then
    OUT="$2"
else
    BASENAME="$(basename "$SRC")"
    OUT="$SRC/${BASENAME}-bf16.gguf"
fi

# Conversion is skipped when the output already exists. The last stdout line
# is always the output path, in both the skip and the convert case.
if [ -f "$OUT" ]; then
    echo "==> already exists: $OUT (skipping; remove to force re-convert)"
    echo "$OUT"
    exit 0
fi

# Convert into a sibling temp file and rename it into place only on success.
# Writing straight to $OUT would leave a truncated file behind if the
# converter failed or was interrupted, and the "already exists" check above
# would then treat that file as a finished conversion on the next run.
# The temp file lives next to $OUT so the final mv is a same-filesystem rename.
TMP_OUT="${OUT%.gguf}.partial.$$.gguf"
trap 'rm -f -- "$TMP_OUT"' EXIT

echo "==> converting $SRC -> $OUT (BF16)"
python3 "$CONVERT_PY" "$SRC" --outfile "$TMP_OUT" --outtype bf16
mv -f -- "$TMP_OUT" "$OUT"

echo "=== done ==="
echo "$OUT"