Files
aituner/infra/gpu_fleet/config/jobs.example.toml
2026-04-04 21:26:37 +08:00

28 lines
732 B
TOML

# This file is an append-only queue source for the monitor.
# Each job name must stay unique and immutable once appended.
# NOTE(review): `version` is a root-table integer; it looks like the schema
# version of this job-file format — confirm against the monitor's loader.
version = 1
# Example entry: one GPU of model "H20", three listed hosts, one artifact path,
# and an `env` table.
# NOTE(review): the field meanings noted below are inferred from the key names —
# confirm against the monitor's schema.
[[jobs]]
# Per the file header, `name` must stay unique and must not change once appended.
name = "smoke-train-h20-1gpu"
# Presumably the number of GPUs the job requests.
gpus = 1
gpu_model = "H20"
# Presumably the hosts this job is allowed to be placed on.
hosts = ["dash0", "dash1", "dash2"]
command = "python train.py --config configs/smoke.toml"
# Presumably paths to collect after the run; here a single output directory.
artifacts = ["outputs/smoke-train-h20-1gpu"]
# Inline table of string key/value pairs; presumably extra environment
# variables for `command` — confirm.
env = { WANDB_MODE = "offline" }
# Example entry: four GPUs of model "5090", a single listed host, and two
# artifact paths (an output directory and a log file). No `env` key is set.
# NOTE(review): field meanings are inferred from the key names — confirm
# against the monitor's schema.
[[jobs]]
# Per the file header, `name` must stay unique and must not change once appended.
name = "eval-5090-4gpu"
gpus = 4
# Keep the quotes: an unquoted 5090 would parse as a TOML integer, not a string.
gpu_model = "5090"
hosts = ["dash5"]
command = "python eval.py --config configs/eval.toml"
artifacts = ["outputs/eval-5090-4gpu", "logs/eval-5090-4gpu.log"]
# Example entry: two GPUs on a single listed host ("dash3"). This entry sets
# neither `gpu_model` nor `env`.
# NOTE(review): presumably `gpu_model` is optional when `hosts` already pins
# the machine — confirm against the monitor's schema before copying this entry.
[[jobs]]
# Per the file header, `name` must stay unique and must not change once appended.
name = "special-dash3-run"
gpus = 2
hosts = ["dash3"]
command = "python benchmark.py --suite long-context"
artifacts = ["outputs/special-dash3-run"]