#!/usr/bin/env bash
#
# run_v12_phase.sh - build, train, evaluate, sample, export and monitor the
# xtrain v12 model.
#
# Usage: scripts/run_v12_phase.sh ACTION   (run with no action for the list)
#
# Every setting below can be overridden from the environment. In addition to
# the variables given defaults in the configuration section, the script reads:
#   XTRAIN_ROOT       repository root (default: parent of this script's dir)
#   CKPT              explicit checkpoint for eval-fixed / sample / export
#   SMOKE_STEPS       step count for the smoke action (default 30)
#   MONITOR_INTERVAL  seconds between monitor refreshes (default 30)
#   MAX_TOKENS, TEMPERATURE, PROMPTS_FILE   options for the sample action
set -euo pipefail

ROOT="${XTRAIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
cd "$ROOT"
export PATH="/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH"

#######################################
# Print a path with a trailing ".u16.bin" removed, if present.
# Arguments: $1 - path
# Outputs:   the (possibly shortened) path on stdout
#######################################
strip_token_cache_suffix() {
  local path="$1"
  if [[ "$path" == *.u16.bin ]]; then
    printf '%s\n' "${path%.u16.bin}"
  else
    printf '%s\n' "$path"
  fi
}

# ---- Configuration (environment overrides win) ------------------------------
RUN_DIR="${RUN_DIR:-/dashscope-tmp/wjh/xtrain_v12}"
TOKENIZER="${TOKENIZER:-/opt/wjh/models/gpt2/tokenizer.json}"
CORPUS="${CORPUS:-data/fineweb-edu.txt}"
FIXED_EVAL="${FIXED_EVAL:-/dashscope-tmp/wjh/xtrain_fixed_eval_v1/fineweb-fixed-eval-v1.txt}"
EXPORT_DIR="${EXPORT_DIR:-/opt/wjh/projects/tiny-models/v12-fineweb-edu-1b-longctx}"
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
TMUX_SESSION="${TMUX_SESSION:-xtrain_v12}"

HEADS="${HEADS:-52}"
HEAD_DIM="${HEAD_DIM:-32}"
KV_HEADS="${KV_HEADS:-13}"
LAYERS="${LAYERS:-22}"
FFN="${FFN:-6656}"
SEQ="${SEQ:-1024}"
BATCH="${BATCH:-16}"
ACCUM="${ACCUM:-15}"
MAX_LR="${MAX_LR:-4e-4}"
MIN_LR="${MIN_LR:-4e-5}"
VAL_TOKENS="${VAL_TOKENS:-1000000}"
EVAL_BATCHES="${EVAL_BATCHES:-64}"
FIXED_EVAL_SEQ="${FIXED_EVAL_SEQ:-1024}"
FIXED_EVAL_BATCHES="${FIXED_EVAL_BATCHES:-64}"
PILOT_STEPS="${PILOT_STEPS:-300}"
FULL_STEPS="${FULL_STEPS:-27524}"
PILOT_EVAL_EVERY="${PILOT_EVAL_EVERY:-100}"
FULL_EVAL_EVERY="${FULL_EVAL_EVERY:-500}"

# Callers may pass the ".u16.bin" name; the binaries are given the path
# without that suffix.
CORPUS="$(strip_token_cache_suffix "$CORPUS")"
FIXED_EVAL="$(strip_token_cache_suffix "$FIXED_EVAL")"

# Architecture flags shared by every binary invocation.
ARCH_ARGS=(
  --heads "$HEADS"
  --head-dim "$HEAD_DIM"
  --kv-heads "$KV_HEADS"
  --layers "$LAYERS"
  --ffn "$FFN"
)

#######################################
# Print the help text to stdout.
#######################################
usage() {
  cat <<'EOF'
usage: scripts/run_v12_phase.sh ACTION

Actions:
  build        Build xtrain train/export/sample binaries.
  smoke        Run a short no-checkpoint v12 seq1024 smoke test in foreground.
  pilot        Run a 300-step v12 pilot with held-out eval and checkpoint.
  full         Run the full one-epoch v12 base training job.
  eval-fixed   Evaluate a checkpoint on fixed eval v1.
  sample       Run xtrain greedy_sample on fixed chat-alpha prompts.
  export       Export a checkpoint to xserv/tiny-models format.
  status       Print one progress snapshot from RUN_DIR/full.log or pilot.log.
  monitor      Show a refreshing progress dashboard until interrupted.
  start-pilot  Start pilot + monitor in tmux sessions.
  start-full   Start full train + monitor in tmux sessions.

Environment overrides:
  RUN_DIR, TOKENIZER, CORPUS, FIXED_EVAL, EXPORT_DIR, CUDA_VISIBLE_DEVICES
  HEADS, HEAD_DIM, KV_HEADS, LAYERS, FFN, SEQ, BATCH, ACCUM
  MAX_LR, MIN_LR, PILOT_STEPS, FULL_STEPS, FIXED_EVAL_SEQ
EOF
}

#######################################
# Build the release binaries used by the other actions.
#######################################
build() {
  cargo build --release -p xtrain-distributed --bin train_ddp
  cargo build --release -p xtrain-train --bin train --bin export_safetensors --bin greedy_sample
}

#######################################
# Write RUN_DIR/META.txt describing the run configuration.
# Arguments: $1 - run kind (written as "run=<kind>")
#######################################
write_meta() {
  local kind="$1"
  mkdir -p "$RUN_DIR"
  {
    echo "run=$kind"
    echo "created_utc=$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
    echo "arch=heads${HEADS}_hd${HEAD_DIM}_kv${KV_HEADS}_layers${LAYERS}_ffn${FFN}"
    echo "seq=$SEQ"
    echo "batch=$BATCH"
    echo "accum=$ACCUM"
    echo "effective_batch=$((BATCH * ACCUM))"
    echo "tokens_per_step=$((BATCH * ACCUM * SEQ))"
    echo "max_lr=$MAX_LR"
    echo "min_lr=$MIN_LR"
    echo "corpus=$CORPUS"
    echo "fixed_eval=$FIXED_EVAL"
    echo "fixed_eval_seq=$FIXED_EVAL_SEQ"
  } > "$RUN_DIR/META.txt"
}

#######################################
# Write RUN_DIR/env.sh: one shell-quoted "export NAME=value" line per setting,
# so a tmux-launched copy of this script sees the same configuration.
#######################################
write_env_file() {
  mkdir -p "$RUN_DIR"
  local env_file="$RUN_DIR/env.sh"
  local name # keep the loop variable out of the global scope
  local -a names=(
    XTRAIN_ROOT RUN_DIR TOKENIZER CORPUS FIXED_EVAL EXPORT_DIR
    CUDA_VISIBLE_DEVICES TMUX_SESSION
    HEADS HEAD_DIM KV_HEADS LAYERS FFN SEQ BATCH ACCUM MAX_LR MIN_LR
    VAL_TOKENS EVAL_BATCHES FIXED_EVAL_SEQ FIXED_EVAL_BATCHES
    PILOT_STEPS FULL_STEPS PILOT_EVAL_EVERY FULL_EVAL_EVERY
  )
  : > "$env_file"
  for name in "${names[@]}"; do
    if [[ "$name" == "XTRAIN_ROOT" ]]; then
      # XTRAIN_ROOT may be unset in this shell; record the resolved ROOT.
      printf 'export XTRAIN_ROOT=%q\n' "$ROOT" >> "$env_file"
    else
      printf 'export %s=%q\n' "$name" "${!name}" >> "$env_file"
    fi
  done
}

#######################################
# Run train_ddp in the foreground, teeing all output to RUN_DIR/<kind>.log.
# Arguments:
#   $1 - run kind (names the log and the .steps / .tokens_per_step files)
#   $2 - number of training steps
#   $3 - eval interval (only used when a checkpoint path is given)
#   $4 - checkpoint path; empty means no --ckpt and eval disabled
# Returns:   the trainer's exit status (via pipefail)
#######################################
run_train() {
  local kind="$1"
  local steps="$2"
  local eval_every="$3"
  local ckpt="$4"
  local log="$RUN_DIR/${kind}.log"
  local rc=0

  write_meta "$kind"
  echo "$steps" > "$RUN_DIR/${kind}.steps"
  echo "$((BATCH * ACCUM * SEQ))" > "$RUN_DIR/${kind}.tokens_per_step"

  # Without a checkpoint path, validation is switched off entirely.
  local val_tokens=0
  local every=0
  if [[ -n "$ckpt" ]]; then
    val_tokens="$VAL_TOKENS"
    every="$eval_every"
  fi

  local -a train_cmd=(
    target/release/train_ddp
    "$TOKENIZER" "$CORPUS"
    "${ARCH_ARGS[@]}"
    --steps "$steps" --batch "$BATCH" --accum-steps "$ACCUM" --seq "$SEQ"
    --max-lr "$MAX_LR" --min-lr "$MIN_LR"
    --val-tokens "$val_tokens" --eval-every "$every" --eval-batches "$EVAL_BATCHES"
    --bf16 --recompute --flash --dropout 0.0
  )
  if [[ -n "$ckpt" ]]; then
    train_cmd+=(--ckpt "$ckpt")
  fi

  # The group is a pipeline stage, so it runs in a subshell: `exit` ends only
  # that stage, and pipefail carries its status out of the pipeline.
  {
    echo "RUN_NAME=xtrain_v12_${kind}"
    echo "RUN_START_ISO=$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
    echo "RUN_START_EPOCH=$(date +%s)"
    echo "CKPT=$ckpt"
    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
    echo "TOTAL_STEPS=$steps"
    echo "TOKENS_PER_STEP=$((BATCH * ACCUM * SEQ))"
    set -x # trace the exact trainer command line into the log
    # `|| rc=$?` records a failure without tripping set -e, so the
    # RUN_END_ISO / RUN_EXIT_CODE trailer lines are always written.
    CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" "${train_cmd[@]}" || rc=$?
    set +x
    echo "RUN_END_ISO=$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
    echo "RUN_EXIT_CODE=$rc"
    exit "$rc"
  } 2>&1 | tee "$log"
}

#######################################
# Print the checkpoint to operate on: $CKPT if set and non-empty, else
# RUN_DIR/xtrain_v12.ckpt if that file exists, else the pilot checkpoint path
# (printed whether or not it exists).
#######################################
checkpoint_path() {
  local preferred="$RUN_DIR/xtrain_v12.ckpt"
  local pilot="$RUN_DIR/xtrain_v12_pilot.ckpt"
  if [[ -n "${CKPT:-}" ]]; then
    echo "$CKPT"
  elif [[ -f "$preferred" ]]; then
    echo "$preferred"
  else
    echo "$pilot"
  fi
}

#######################################
# Run the train binary with --eval-ckpt on FIXED_EVAL; output is teed to
# RUN_DIR/eval_fixed.log.
#######################################
eval_fixed() {
  local ckpt
  ckpt="$(checkpoint_path)"
  mkdir -p "$RUN_DIR" # tee below needs the directory to exist
  target/release/train \
    "$TOKENIZER" "$FIXED_EVAL" \
    "${ARCH_ARGS[@]}" \
    --seq "$FIXED_EVAL_SEQ" --batch 1 --steps 1 \
    --val-tokens "$VAL_TOKENS" --eval-batches "$FIXED_EVAL_BATCHES" \
    --bf16 --recompute --flash \
    --eval-ckpt "$ckpt" \
    2>&1 | tee "$RUN_DIR/eval_fixed.log"
}

#######################################
# Run greedy_sample on the prompts file; output is teed to
# RUN_DIR/sample_fixed.log.
#######################################
sample_fixed() {
  local ckpt
  ckpt="$(checkpoint_path)"
  mkdir -p "$RUN_DIR" # tee below needs the directory to exist
  target/release/greedy_sample \
    "$ckpt" "$TOKENIZER" \
    "${ARCH_ARGS[@]}" \
    --max-tokens "${MAX_TOKENS:-120}" \
    --temperature "${TEMPERATURE:-0}" \
    --prompts-file "${PROMPTS_FILE:-scripts/chat_alpha_fixed_prompts.txt}" \
    2>&1 | tee "$RUN_DIR/sample_fixed.log"
}

#######################################
# Replace EXPORT_DIR with a fresh export of the selected checkpoint, copy the
# checkpoint alongside it as xtrain.ckpt, and record the export path in
# RUN_DIR/export_path.txt.
# Returns:   1 (leaving EXPORT_DIR untouched) if the checkpoint is missing
#######################################
export_model() {
  local ckpt
  ckpt="$(checkpoint_path)"
  # Validate before the destructive rm below, so a wrong or missing
  # checkpoint cannot wipe out a previous good export.
  if [[ ! -f "$ckpt" ]]; then
    echo "checkpoint not found: $ckpt" >&2
    return 1
  fi
  mkdir -p "$RUN_DIR"
  rm -rf -- "${EXPORT_DIR:?EXPORT_DIR must not be empty}"
  target/release/export_safetensors \
    "$ckpt" "$TOKENIZER" "$EXPORT_DIR" \
    "${ARCH_ARGS[@]}"
  cp -- "$ckpt" "$EXPORT_DIR/xtrain.ckpt"
  echo "$EXPORT_DIR" | tee "$RUN_DIR/export_path.txt"
}

#######################################
# Print one progress snapshot: training/eval state parsed from the log,
# followed by GPU and disk usage lines.
# Arguments: $1 - log file (default RUN_DIR/full.log; if that file does not
#                 exist, RUN_DIR/pilot.log is used instead)
#######################################
progress_once() {
  local log="${1:-$RUN_DIR/full.log}"
  [[ -f "$log" ]] || log="$RUN_DIR/pilot.log"
  python3 - "$log" <<'PY'
import os, re, sys, time

log = sys.argv[1]
text = open(log, errors="ignore").read() if os.path.exists(log) else ""
steps = re.findall(r"\[rank0\] step\s+(\d+)/(\d+): loss\s+(\S+) lr\s+(\S+) gnorm\s+(\S+) \((\S+) tok/s global", text)
evals = re.findall(r"eval @ step\s+(\d+): val loss\s+(\S+)( \(best\))?", text)
start = re.search(r"RUN_START_EPOCH=(\d+)", text)
tokens_per_step = re.search(r"TOKENS_PER_STEP=(\d+)", text)
tokens_per_step = int(tokens_per_step.group(1)) if tokens_per_step else 245760
exit_code = re.search(r"RUN_EXIT_CODE=(\d+)", text)
warnings = re.findall(r"(?i)(nan|inf|oom|out of memory|panic|error)", text)

print("xtrain v12 |", time.strftime("%Y-%m-%d %H:%M:%S %Z"), "| log:", log)
if warnings:
    print("WARNING: suspicious log tokens:", ", ".join(sorted(set(w.lower() for w in warnings))[:8]))
if not steps:
    print("waiting for first rank0 step")
else:
    s, total, loss, lr, gnorm, tps = steps[-1]
    done = int(s) + 1
    total = int(total)
    # A logged total of 0 would otherwise raise ZeroDivisionError.
    pct = min(100.0, done * 100.0 / total) if total > 0 else 0.0
    width = 44
    fill = int(width * pct / 100.0)
    bar = "#" * fill + "." * (width - fill)
    try:
        tpsf = float(tps)
    except ValueError:
        tpsf = 0.0
    elapsed = time.time() - int(start.group(1)) if start else None
    eta = (total - done) * tokens_per_step / tpsf if tpsf > 0 else None

    def fmt(sec):
        if sec is None:
            return "n/a"
        sec = int(max(0, sec))
        h, r = divmod(sec, 3600)
        m, s = divmod(r, 60)
        return f"{h:02d}:{m:02d}:{s:02d}"

    print(f"[{bar}] {pct:6.2f}%")
    print(f"step {done}/{total} | loss {loss} | lr {lr} | gnorm {gnorm}")
    print(f"speed {tpsf:,.0f} tok/s | elapsed {fmt(elapsed)} | ETA {fmt(eta)}")
if evals:
    s, v, best = evals[-1]
    best_vals = []
    for _, vv, mark in evals:
        if not mark:
            continue
        try:
            best_vals.append(float(vv))
        except ValueError:
            pass
    best_txt = f"best {min(best_vals):.4f}" if best_vals else "best n/a"
    try:
        val_txt = f"{float(v):.4f}"
    except ValueError:
        val_txt = v
    print(f"eval step {int(s)+1}: val {val_txt} {best.strip()} | {best_txt}")
else:
    print("eval: waiting")
if exit_code:
    print("FINISHED exit code", exit_code.group(1))
PY
  echo

  # The GPU and disk probes are best-effort: under `set -e` + pipefail a
  # failing probe would otherwise terminate the monitor loop.
  if command -v nvidia-smi > /dev/null 2>&1; then
    nvidia-smi --query-gpu=index,memory.used,utilization.gpu --format=csv,noheader,nounits \
      | awk -F, '{printf "gpu%s %sMiB %s%% ", $1, $2, $3} NR%4==0{print ""} END{print ""}' \
      || echo "GPU: nvidia-smi query failed"
  else
    echo "GPU: nvidia-smi not found"
  fi

  # Report the filesystem that actually holds RUN_DIR (it may be overridden);
  # fall back to the default mount point while RUN_DIR does not exist yet.
  local disk_path="$RUN_DIR"
  [[ -e "$disk_path" ]] || disk_path="/dashscope-tmp"
  # -P keeps each filesystem on a single line so NR==2 is the data row.
  df -hP "$disk_path" 2> /dev/null \
    | awk 'NR==2{print "Disk: "$4" free ("$5" used)"}' \
    || echo "Disk: unavailable for $disk_path"
}

#######################################
# Clear the screen and print a progress snapshot every MONITOR_INTERVAL
# seconds (default 30) until interrupted.
#######################################
monitor() {
  while true; do
    clear
    progress_once
    sleep "${MONITOR_INTERVAL:-30}"
  done
}

#######################################
# Start ACTION in a detached tmux session named TMUX_SESSION, plus a
# "<session>_mon" session running the monitor if one is not already present.
# Arguments: $1 - action to run inside tmux (pilot or full)
# Exits 1 if the training session already exists.
#######################################
start_tmux() {
  local kind="$1"
  local session="$TMUX_SESSION"
  if tmux has-session -t "=${session}" 2> /dev/null; then
    echo "tmux session already exists: $session"
    echo "attach: tmux attach -t $session"
    exit 1
  fi
  write_env_file

  # NOTE(review): RUN_DIR and ROOT are spliced into a single-quoted
  # `bash -lc` string, so a path containing a single or double quote would
  # break the command.
  local launch="source \"$RUN_DIR/env.sh\" && cd \"$ROOT\" && scripts/run_v12_phase.sh"
  tmux new-session -d -s "$session" "bash -lc '$launch $kind'"
  if ! tmux has-session -t "=${session}_mon" 2> /dev/null; then
    tmux new-session -d -s "${session}_mon" "bash -lc '$launch monitor'"
  fi
  echo "started $kind in tmux: $session"
  echo "monitor: tmux attach -t ${session}_mon"
}

#######################################
# Dispatch on the first command-line argument.
# Arguments: $1 - action (empty, -h, --help or help prints usage)
# Exits 2 on an unknown action.
#######################################
main() {
  local action="${1:-}"
  case "$action" in
    build)
      build
      ;;
    smoke)
      build
      run_train smoke "${SMOKE_STEPS:-30}" 0 ""
      ;;
    pilot)
      build
      run_train pilot "$PILOT_STEPS" "$PILOT_EVAL_EVERY" "$RUN_DIR/xtrain_v12_pilot.ckpt"
      ;;
    full)
      build
      run_train full "$FULL_STEPS" "$FULL_EVAL_EVERY" "$RUN_DIR/xtrain_v12.ckpt"
      ;;
    eval-fixed)
      build
      eval_fixed
      ;;
    sample)
      build
      sample_fixed
      ;;
    export)
      build
      export_model
      ;;
    status)
      progress_once
      ;;
    monitor)
      monitor
      ;;
    start-pilot)
      start_tmux pilot
      ;;
    start-full)
      start_tmux full
      ;;
    "" | -h | --help | help)
      usage
      ;;
    *)
      echo "unknown action: $action" >&2
      usage >&2
      exit 2
      ;;
  esac
}

main "$@"