docs: v12 — 1.05B long-ctx base + chat-alpha SFT quality check
- run 12: dim1664/22L true-GQA 1.05B base, seq1024, 6.765B FineWeb tokens, 81h on 8x5090. Fixed eval v1 @seq1024 = 2.7410 vs v11 2.7467 — a real but marginal gain; v11->v12 is a capacity-only step on fixed data, so the ~0.2% return confirms the 1B base is now data-limited.
- run 13: three SFT stages from the v12 base (synthetic / anchor / real-mix-repair). The pipeline works and produces a chat-shaped model that follows the format and stops, but none of the variants is a stable high-quality chat model — bottleneck is SFT data quality + selection signal (val loss decouples from generation quality), not infra.
- scripts/run_v12_phase.sh wrapper + chat_alpha_fixed_prompts.txt eval set.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
10
scripts/chat_alpha_fixed_prompts.txt
Normal file
10
scripts/chat_alpha_fixed_prompts.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
# One escaped prompt per line. `greedy_sample` decodes literal \n before tokenizing.
|
||||
User: Explain supervised fine-tuning to a junior engineer.\nAssistant:
|
||||
User: What high-quality SFT data are we using now?\nAssistant:
|
||||
User: What training data did chat-alpha-v1 use?\nAssistant:
|
||||
User: What is 17% of 240?\nAssistant:
|
||||
User: I found that my small language model repeats the same phrase during generation. What should I inspect first?\nAssistant:
|
||||
User: Summarize this passage in one sentence: A team trained a base model, then continued with chat examples at a low learning rate. Validation loss improved, but they still need real prompt tests before calling it useful.\nAssistant:
|
||||
User: Who will win the world championship in 2099?\nAssistant:
|
||||
User: Give a compact checklist before launching an SFT run.\nAssistant:
|
||||
User: Write a Python function that returns the larger of two numbers.\nAssistant:
|
||||
329
scripts/run_v12_phase.sh
Executable file
329
scripts/run_v12_phase.sh
Executable file
@@ -0,0 +1,329 @@
|
||||
#!/usr/bin/env bash
# Strict mode: stop on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Repository root: XTRAIN_ROOT wins when it is set and non-empty; otherwise use
# the parent of the directory that contains this script.
if [[ -n "${XTRAIN_ROOT:-}" ]]; then
  ROOT="$XTRAIN_ROOT"
else
  ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
fi

# Everything below uses paths relative to the repository root
# (target/release/..., scripts/...).
cd "$ROOT"

# Search the CUDA and cargo tool directories before the inherited PATH.
PATH="/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH"
export PATH
|
||||
|
||||
# Print $1 with one trailing ".u16.bin" removed.
# A value that does not end in ".u16.bin" is printed unchanged.
# Arguments: $1 - path to normalise
# Outputs:   the resulting path on stdout, followed by a newline
strip_token_cache_suffix() {
  local candidate="$1"
  # "%" removes the shortest matching suffix and is a no-op when nothing matches.
  printf '%s\n' "${candidate%.u16.bin}"
}
|
||||
|
||||
# Default settings. Each variable keeps a non-empty value inherited from the
# environment and otherwise receives the default given here.

# Locations and session naming.
: "${RUN_DIR:=/dashscope-tmp/wjh/xtrain_v12}"
: "${TOKENIZER:=/opt/wjh/models/gpt2/tokenizer.json}"
: "${CORPUS:=data/fineweb-edu.txt}"
: "${FIXED_EVAL:=/dashscope-tmp/wjh/xtrain_fixed_eval_v1/fineweb-fixed-eval-v1.txt}"
: "${EXPORT_DIR:=/opt/wjh/projects/tiny-models/v12-fineweb-edu-1b-longctx}"
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
: "${TMUX_SESSION:=xtrain_v12}"

# Model shape; forwarded to the binaries through ARCH_ARGS.
: "${HEADS:=52}"
: "${HEAD_DIM:=32}"
: "${KV_HEADS:=13}"
: "${LAYERS:=22}"
: "${FFN:=6656}"

# Training batch geometry and learning-rate bounds.
: "${SEQ:=1024}"
: "${BATCH:=16}"
: "${ACCUM:=15}"
: "${MAX_LR:=4e-4}"
: "${MIN_LR:=4e-5}"

# Evaluation settings (in-training eval and the fixed-eval action).
: "${VAL_TOKENS:=1000000}"
: "${EVAL_BATCHES:=64}"
: "${FIXED_EVAL_SEQ:=1024}"
: "${FIXED_EVAL_BATCHES:=64}"

# Step counts and eval cadence for the pilot and full actions.
: "${PILOT_STEPS:=300}"
: "${FULL_STEPS:=27524}"
: "${PILOT_EVAL_EVERY:=100}"
: "${FULL_EVAL_EVERY:=500}"
|
||||
|
||||
# CORPUS and FIXED_EVAL may be supplied with a trailing ".u16.bin"; from here
# on both variables hold the path without that suffix.
CORPUS=$(strip_token_cache_suffix "$CORPUS")
FIXED_EVAL=$(strip_token_cache_suffix "$FIXED_EVAL")

# Model-shape flags passed to train_ddp, train, greedy_sample and
# export_safetensors alike.
ARCH_ARGS=(
  --heads "$HEADS"
  --head-dim "$HEAD_DIM"
  --kv-heads "$KV_HEADS"
  --layers "$LAYERS"
  --ffn "$FFN"
)
|
||||
|
||||
# Print the command-line help (action list and environment overrides) to stdout.
usage() {
  # One argument per output line; an empty argument yields a blank line.
  printf '%s\n' \
    'usage: scripts/run_v12_phase.sh ACTION' \
    '' \
    'Actions:' \
    'build Build xtrain train/export/sample binaries.' \
    'smoke Run a short no-checkpoint v12 seq1024 smoke test in foreground.' \
    'pilot Run a 300-step v12 pilot with held-out eval and checkpoint.' \
    'full Run the full one-epoch v12 base training job.' \
    'eval-fixed Evaluate a checkpoint on fixed eval v1.' \
    'sample Run xtrain greedy_sample on fixed chat-alpha prompts.' \
    'export Export a checkpoint to xserv/tiny-models format.' \
    'status Print one progress snapshot from RUN_DIR/full.log or pilot.log.' \
    'monitor Show a refreshing progress dashboard until interrupted.' \
    'start-pilot Start pilot + monitor in tmux sessions.' \
    'start-full Start full train + monitor in tmux sessions.' \
    '' \
    'Environment overrides:' \
    'RUN_DIR, TOKENIZER, CORPUS, FIXED_EVAL, EXPORT_DIR, CUDA_VISIBLE_DEVICES' \
    'HEADS, HEAD_DIM, KV_HEADS, LAYERS, FFN, SEQ, BATCH, ACCUM' \
    'MAX_LR, MIN_LR, PILOT_STEPS, FULL_STEPS, FIXED_EVAL_SEQ'
}
|
||||
|
||||
# Compile the release binaries this script runs: train_ddp from the
# xtrain-distributed package, then train, export_safetensors and greedy_sample
# from the xtrain-train package.
build() {
  local -a release=(build --release)
  cargo "${release[@]}" -p xtrain-distributed --bin train_ddp
  cargo "${release[@]}" -p xtrain-train --bin train --bin export_safetensors --bin greedy_sample
}
|
||||
|
||||
# Record the settings of a run in $RUN_DIR/META.txt as key=value lines.
# Globals:   RUN_DIR, HEADS, HEAD_DIM, KV_HEADS, LAYERS, FFN, SEQ, BATCH, ACCUM,
#            MAX_LR, MIN_LR, CORPUS, FIXED_EVAL, FIXED_EVAL_SEQ (all read)
# Arguments: $1 - run kind, stored as the "run" field
# Outputs:   overwrites $RUN_DIR/META.txt (RUN_DIR is created when missing)
write_meta() {
  local kind="$1"
  local meta_path="$RUN_DIR/META.txt"
  mkdir -p "$RUN_DIR"
  # One argument per line of the file; the order is the order of the fields.
  printf '%s\n' \
    "run=$kind" \
    "created_utc=$(date -u '+%Y-%m-%dT%H:%M:%SZ')" \
    "arch=heads${HEADS}_hd${HEAD_DIM}_kv${KV_HEADS}_layers${LAYERS}_ffn${FFN}" \
    "seq=$SEQ" \
    "batch=$BATCH" \
    "accum=$ACCUM" \
    "effective_batch=$((BATCH * ACCUM))" \
    "tokens_per_step=$((BATCH * ACCUM * SEQ))" \
    "max_lr=$MAX_LR" \
    "min_lr=$MIN_LR" \
    "corpus=$CORPUS" \
    "fixed_eval=$FIXED_EVAL" \
    "fixed_eval_seq=$FIXED_EVAL_SEQ" \
    > "$meta_path"
}
|
||||
|
||||
# Snapshot the current settings into $RUN_DIR/env.sh as "export NAME=value"
# lines so that a fresh shell can source the file and see the same values.
# Values are written with printf %q, i.e. shell-quoted.
# Globals:   ROOT plus every variable listed in `names` (read)
# Outputs:   overwrites $RUN_DIR/env.sh (RUN_DIR is created when missing)
write_env_file() {
  local env_file="$RUN_DIR/env.sh"
  # Loop variables are local so that calling this function cannot clobber a
  # caller's `name` or `value`.
  local name value
  local -a names=(
    XTRAIN_ROOT RUN_DIR TOKENIZER CORPUS FIXED_EVAL EXPORT_DIR CUDA_VISIBLE_DEVICES
    TMUX_SESSION HEADS HEAD_DIM KV_HEADS LAYERS FFN SEQ BATCH ACCUM MAX_LR MIN_LR
    VAL_TOKENS EVAL_BATCHES FIXED_EVAL_SEQ FIXED_EVAL_BATCHES PILOT_STEPS
    FULL_STEPS PILOT_EVAL_EVERY FULL_EVAL_EVERY
  )

  mkdir -p "$RUN_DIR"
  # The whole loop shares one redirection: the file is truncated and opened once.
  for name in "${names[@]}"; do
    if [[ "$name" == "XTRAIN_ROOT" ]]; then
      # XTRAIN_ROOT is exported from the resolved ROOT, not from the caller's
      # own XTRAIN_ROOT (which may be unset).
      value="$ROOT"
    else
      value="${!name}"
    fi
    printf 'export %s=%q\n' "$name" "$value"
  done > "$env_file"
}
|
||||
|
||||
# Launch target/release/train_ddp in the foreground and mirror all of its
# output (stdout and stderr) into $RUN_DIR/<kind>.log through tee.
# Arguments:
#   $1 - run kind; names the log and the .steps / .tokens_per_step files
#   $2 - number of training steps
#   $3 - eval interval; only used when a checkpoint path is given
#   $4 - checkpoint path; empty selects the no-checkpoint variant, which runs
#        with "--val-tokens 0 --eval-every 0" and without --ckpt
# Outputs:   META.txt (via write_meta), <kind>.steps, <kind>.tokens_per_step
#            and <kind>.log under RUN_DIR
# Returns:   the status of the pipeline; the logged block exits with the
#            trainer's exit code
run_train() {
  local kind="$1"
  local steps="$2"
  local eval_every="$3"
  local ckpt="$4"
  local log="$RUN_DIR/${kind}.log"
  local tokens_per_step
  local -a launch precision

  write_meta "$kind"
  tokens_per_step=$((BATCH * ACCUM * SEQ))
  echo "$steps" > "$RUN_DIR/${kind}.steps"
  echo "$tokens_per_step" > "$RUN_DIR/${kind}.tokens_per_step"

  # Arguments common to both variants, split around the eval flags so the
  # final argument order stays the same in either branch.
  launch=(
    target/release/train_ddp
    "$TOKENIZER" "$CORPUS"
    "${ARCH_ARGS[@]}"
    --steps "$steps" --batch "$BATCH" --accum-steps "$ACCUM" --seq "$SEQ"
    --max-lr "$MAX_LR" --min-lr "$MIN_LR"
  )
  precision=(--bf16 --recompute --flash --dropout 0.0)

  {
    # Header lines; TOKENS_PER_STEP and RUN_START_EPOCH are read back from the
    # log by progress_once.
    printf '%s\n' \
      "RUN_NAME=xtrain_v12_${kind}" \
      "RUN_START_ISO=$(date -u '+%Y-%m-%dT%H:%M:%SZ')" \
      "RUN_START_EPOCH=$(date +%s)" \
      "CKPT=$ckpt" \
      "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \
      "TOTAL_STEPS=$steps" \
      "TOKENS_PER_STEP=$tokens_per_step"
    set -x
    # errexit is off around the trainer so its exit code can be captured and
    # logged instead of ending this block immediately.
    set +e
    if [[ -n "$ckpt" ]]; then
      CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" "${launch[@]}" \
        --val-tokens "$VAL_TOKENS" --eval-every "$eval_every" --eval-batches "$EVAL_BATCHES" \
        "${precision[@]}" \
        --ckpt "$ckpt"
      rc=$?
    else
      CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" "${launch[@]}" \
        --val-tokens 0 --eval-every 0 --eval-batches "$EVAL_BATCHES" \
        "${precision[@]}"
      rc=$?
    fi
    set -e
    set +x
    echo "RUN_END_ISO=$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
    echo "RUN_EXIT_CODE=$rc"
    # This group is one side of a pipeline and therefore runs in a subshell:
    # `exit` ends only the group and hands the trainer's code to the pipeline.
    exit "$rc"
  } 2>&1 | tee "$log"
}
|
||||
|
||||
# Print the checkpoint path the eval/sample/export actions should use.
# Precedence: a non-empty CKPT, then $RUN_DIR/xtrain_v12.ckpt when that file
# exists, otherwise $RUN_DIR/xtrain_v12_pilot.ckpt (printed even if absent).
checkpoint_path() {
  local full_run="$RUN_DIR/xtrain_v12.ckpt"
  local chosen="$RUN_DIR/xtrain_v12_pilot.ckpt"
  if [[ -f "$full_run" ]]; then
    chosen="$full_run"
  fi
  # Applied last so that an explicit CKPT overrides both defaults.
  if [[ -n "${CKPT:-}" ]]; then
    chosen="$CKPT"
  fi
  echo "$chosen"
}
|
||||
|
||||
# Run target/release/train in --eval-ckpt mode against FIXED_EVAL and mirror
# its output into $RUN_DIR/eval_fixed.log.
# Globals:   TOKENIZER, FIXED_EVAL, ARCH_ARGS, FIXED_EVAL_SEQ, VAL_TOKENS,
#            FIXED_EVAL_BATCHES, RUN_DIR (read); CKPT via checkpoint_path
# Returns:   non-zero without running anything when the checkpoint is missing
eval_fixed() {
  local ckpt
  ckpt="$(checkpoint_path)" || return

  # checkpoint_path falls back to the pilot path even when no such file
  # exists, so report a missing checkpoint here instead of starting the binary.
  if [[ ! -f "$ckpt" ]]; then
    printf 'eval-fixed: checkpoint not found: %s\n' "$ckpt" >&2
    return 1
  fi

  # With CKPT pointing elsewhere RUN_DIR may not exist yet; tee needs it.
  mkdir -p "$RUN_DIR" || return

  target/release/train \
    "$TOKENIZER" "$FIXED_EVAL" \
    "${ARCH_ARGS[@]}" \
    --seq "$FIXED_EVAL_SEQ" --batch 1 --steps 1 \
    --val-tokens "$VAL_TOKENS" --eval-batches "$FIXED_EVAL_BATCHES" \
    --bf16 --recompute --flash \
    --eval-ckpt "$ckpt" \
    2>&1 | tee "$RUN_DIR/eval_fixed.log"
}
|
||||
|
||||
# Run target/release/greedy_sample on a prompts file and mirror its output
# into $RUN_DIR/sample_fixed.log.
# Optional overrides: MAX_TOKENS (default 120), TEMPERATURE (default 0),
# PROMPTS_FILE (default scripts/chat_alpha_fixed_prompts.txt).
# Globals:   TOKENIZER, ARCH_ARGS, RUN_DIR (read); CKPT via checkpoint_path
# Returns:   non-zero without running anything when the checkpoint is missing
sample_fixed() {
  local ckpt
  ckpt="$(checkpoint_path)" || return

  # checkpoint_path falls back to the pilot path even when no such file
  # exists, so report a missing checkpoint here instead of starting the binary.
  if [[ ! -f "$ckpt" ]]; then
    printf 'sample: checkpoint not found: %s\n' "$ckpt" >&2
    return 1
  fi

  # With CKPT pointing elsewhere RUN_DIR may not exist yet; tee needs it.
  mkdir -p "$RUN_DIR" || return

  target/release/greedy_sample \
    "$ckpt" "$TOKENIZER" \
    "${ARCH_ARGS[@]}" \
    --max-tokens "${MAX_TOKENS:-120}" \
    --temperature "${TEMPERATURE:-0}" \
    --prompts-file "${PROMPTS_FILE:-scripts/chat_alpha_fixed_prompts.txt}" \
    2>&1 | tee "$RUN_DIR/sample_fixed.log"
}
|
||||
|
||||
# Export a checkpoint with target/release/export_safetensors into EXPORT_DIR,
# copy the checkpoint next to the export as xtrain.ckpt, and record the export
# directory in $RUN_DIR/export_path.txt.
#
# EXPORT_DIR is deleted and recreated, so the checkpoint is validated BEFORE
# anything is removed:
#   * a missing checkpoint aborts without touching the previous export;
#   * a checkpoint stored inside EXPORT_DIR is refused, because removing the
#     directory would delete it before it could be read.
# Globals:   EXPORT_DIR, TOKENIZER, ARCH_ARGS, RUN_DIR (read); CKPT via
#            checkpoint_path
# Outputs:   the export directory on stdout
# Returns:   non-zero on any failure
export_model() {
  local ckpt export_real ckpt_dir_real
  ckpt="$(checkpoint_path)" || return

  if [[ ! -f "$ckpt" ]]; then
    printf 'export: checkpoint not found: %s\n' "$ckpt" >&2
    return 1
  fi

  if [[ -d "$EXPORT_DIR" ]]; then
    # Compare physical paths so symlinks and relative paths cannot hide the
    # containment. cd output is discarded in case CDPATH makes it print.
    export_real="$(cd "$EXPORT_DIR" >/dev/null && pwd -P)" || return
    ckpt_dir_real="$(cd "$(dirname -- "$ckpt")" >/dev/null && pwd -P)" || return
    if [[ "$ckpt_dir_real/" == "$export_real/"* ]]; then
      printf 'export: checkpoint %s is inside EXPORT_DIR %s, which would be deleted; move it or set CKPT\n' \
        "$ckpt" "$EXPORT_DIR" >&2
      return 1
    fi
  fi

  rm -rf -- "$EXPORT_DIR" || return
  target/release/export_safetensors \
    "$ckpt" "$TOKENIZER" "$EXPORT_DIR" \
    "${ARCH_ARGS[@]}" || return
  cp -- "$ckpt" "$EXPORT_DIR/xtrain.ckpt" || return

  # RUN_DIR may not exist when CKPT points elsewhere; tee needs it.
  mkdir -p "$RUN_DIR" || return
  echo "$EXPORT_DIR" | tee "$RUN_DIR/export_path.txt"
}
|
||||
|
||||
# Print one progress snapshot: training progress parsed from a run log,
# followed by best-effort GPU and disk lines.
# Arguments: $1 - log file (optional; default $RUN_DIR/full.log). When the
#                 chosen file does not exist, $RUN_DIR/pilot.log is used.
# Outputs:   the snapshot on stdout
# Returns:   non-zero if the python3 summary fails; GPU/disk probes never
#            affect the return status
progress_once() {
  local log="${1:-$RUN_DIR/full.log}"
  [[ -f "$log" ]] || log="$RUN_DIR/pilot.log"

  python3 - "$log" <<'PY'
import os, re, sys, time

log = sys.argv[1]
# A log that does not exist yet is treated as empty.
text = open(log, errors="ignore").read() if os.path.exists(log) else ""

steps = re.findall(r"\[rank0\] step\s+(\d+)/(\d+): loss\s+(\S+) lr\s+(\S+) gnorm\s+(\S+) \((\S+) tok/s global", text)
evals = re.findall(r"eval @ step\s+(\d+): val loss\s+(\S+)( \(best\))?", text)
start = re.search(r"RUN_START_EPOCH=(\d+)", text)
tokens_per_step = re.search(r"TOKENS_PER_STEP=(\d+)", text)
# Fallback used when the log carries no TOKENS_PER_STEP header line.
tokens_per_step = int(tokens_per_step.group(1)) if tokens_per_step else 245760
exit_code = re.search(r"RUN_EXIT_CODE=(\d+)", text)
# Match the keywords only as stand-alone words: a letter on either side
# ("info", "inference", "room") is not a hit, while "CUDA_ERROR_...", "Error:"
# and "panicked" still are.
warnings = re.findall(r"(?i)(?<![a-z])(nan|inf|oom|out of memory|panic(?:ked)?|errors?)(?![a-z])", text)

print("xtrain v12 |", time.strftime("%Y-%m-%d %H:%M:%S %Z"), "| log:", log)
if warnings:
    print("WARNING: suspicious log tokens:", ", ".join(sorted(set(w.lower() for w in warnings))[:8]))

if not steps:
    print("waiting for first rank0 step")
else:
    s, total, loss, lr, gnorm, tps = steps[-1]
    # The logged step index is reported as index + 1 completed steps.
    done = int(s) + 1
    total = int(total)
    # Guard against a "step N/0" line instead of raising ZeroDivisionError.
    pct = min(100.0, done * 100.0 / total) if total > 0 else 0.0
    width = 44
    fill = int(width * pct / 100.0)
    bar = "#" * fill + "." * (width - fill)
    try:
        tpsf = float(tps)
    except ValueError:
        tpsf = 0.0
    elapsed = time.time() - int(start.group(1)) if start else None
    # Remaining steps * tokens per step / current global throughput.
    eta = (total - done) * tokens_per_step / tpsf if tpsf > 0 else None

    def fmt(sec):
        """Format seconds as HH:MM:SS; None becomes "n/a", negatives clamp to 0."""
        if sec is None:
            return "n/a"
        sec = int(max(0, sec))
        h, r = divmod(sec, 3600)
        m, s = divmod(r, 60)
        return f"{h:02d}:{m:02d}:{s:02d}"

    print(f"[{bar}] {pct:6.2f}%")
    print(f"step {done}/{total} | loss {loss} | lr {lr} | gnorm {gnorm}")
    print(f"speed {tpsf:,.0f} tok/s | elapsed {fmt(elapsed)} | ETA {fmt(eta)}")

if evals:
    s, v, best = evals[-1]
    # Only evals marked "(best)" in the log contribute to the best value.
    best_vals = []
    for _, vv, mark in evals:
        if not mark:
            continue
        try:
            best_vals.append(float(vv))
        except ValueError:
            pass
    best_txt = f"best {min(best_vals):.4f}" if best_vals else "best n/a"
    try:
        val_txt = f"{float(v):.4f}"
    except ValueError:
        val_txt = v
    print(f"eval step {int(s)+1}: val {val_txt} {best.strip()} | {best_txt}")
else:
    print("eval: waiting")

if exit_code:
    print("FINISHED exit code", exit_code.group(1))
PY

  echo
  # The GPU and disk lines are informational. Under `set -euo pipefail` a
  # missing nvidia-smi or a failing df would otherwise abort the snapshot (and
  # the monitor loop around it), so both probes are deliberately best-effort.
  if command -v nvidia-smi >/dev/null 2>&1; then
    nvidia-smi --query-gpu=index,memory.used,utilization.gpu --format=csv,noheader,nounits \
      | awk -F, '{printf "gpu%s %sMiB %s%% ", $1, $2, $3} NR%4==0{print ""} END{print ""}' \
      || true
  fi
  # Report the filesystem that holds RUN_DIR (under /dashscope-tmp by default),
  # so an overridden RUN_DIR is reflected as well.
  if [[ -d "$RUN_DIR" ]]; then
    df -h -- "$RUN_DIR" | awk 'NR==2{print "Disk: "$4" free ("$5" used)"}' || true
  fi
}
|
||||
|
||||
# Refresh the progress snapshot every MONITOR_INTERVAL seconds (default 30)
# until the process is interrupted. The loop never ends on its own.
monitor() {
  while true; do
    # A dashboard should survive one bad refresh: with errexit active, a
    # failing `clear` (e.g. unusable TERM) or a failed snapshot would otherwise
    # terminate the loop, so both are tolerated and the next tick retries.
    clear || true
    progress_once \
      || printf 'monitor: status snapshot failed (exit %s); retrying\n' "$?" >&2
    sleep "${MONITOR_INTERVAL:-30}"
  done
}
|
||||
|
||||
# Start an action of this script in a detached tmux session named
# TMUX_SESSION and, unless it already exists, a second session
# "<TMUX_SESSION>_mon" running the monitor action.
# Both sessions first source $RUN_DIR/env.sh (written here by write_env_file)
# and change into ROOT.
# Arguments: $1 - action to run in the main session
# Exits with status 1 when the main session already exists.
start_tmux() {
  local kind="$1"
  local session="$TMUX_SESSION"
  local monitor_session="${session}_mon"
  # Shell snippet shared by both sessions; the action name is appended.
  local prelude="source \"$RUN_DIR/env.sh\" && cd \"$ROOT\" && scripts/run_v12_phase.sh"

  # The "=" prefix makes tmux match the session name exactly.
  if tmux has-session -t "=${session}" 2>/dev/null; then
    echo "tmux session already exists: $session"
    echo "attach: tmux attach -t $session"
    exit 1
  fi

  write_env_file
  tmux new-session -d -s "$session" "bash -lc '$prelude $kind'"
  if ! tmux has-session -t "=${monitor_session}" 2>/dev/null; then
    tmux new-session -d -s "$monitor_session" "bash -lc '$prelude monitor'"
  fi

  echo "started $kind in tmux: $session"
  echo "monitor: tmux attach -t $monitor_session"
}
|
||||
|
||||
# Entry point: the first positional argument selects the action; any further
# arguments are ignored. Every action that runs a binary rebuilds first.
action="${1:-}"
case "$action" in
  build)
    build
    ;;
  smoke)
    # No checkpoint path: run_train uses its no-checkpoint variant.
    build
    run_train smoke "${SMOKE_STEPS:-30}" 0 ""
    ;;
  pilot)
    build
    run_train pilot "$PILOT_STEPS" "$PILOT_EVAL_EVERY" "$RUN_DIR/xtrain_v12_pilot.ckpt"
    ;;
  full)
    build
    run_train full "$FULL_STEPS" "$FULL_EVAL_EVERY" "$RUN_DIR/xtrain_v12.ckpt"
    ;;
  eval-fixed)
    build
    eval_fixed
    ;;
  sample)
    build
    sample_fixed
    ;;
  export)
    build
    export_model
    ;;
  status)
    progress_once
    ;;
  monitor)
    monitor
    ;;
  start-pilot | start-full)
    # "start-pilot" -> pilot, "start-full" -> full.
    start_tmux "${action#start-}"
    ;;
  "" | -h | --help | help)
    usage
    ;;
  *)
    echo "unknown action: $action" >&2
    usage >&2
    exit 2
    ;;
esac
|
||||
Reference in New Issue
Block a user