Initial AITuner study orchestrator

This commit is contained in:
gahow
2026-04-04 21:26:37 +08:00
commit cdcca1d9d7
24 changed files with 3357 additions and 0 deletions

View File

@@ -0,0 +1,59 @@
# NOTE(review): presumably the schema version of this config file — confirm
# against the loader.
version = 1

# Both directories sit under ".aituner/", which the [sync] exclude list
# below leaves out of the transfer.
[paths]
state_dir = ".aituner/gpu_fleet/state"
artifacts_dir = ".aituner/gpu_fleet/artifacts"

[ssh]
connect_timeout_sec = 10

# NOTE(review): the two gpu_free_* keys look like the thresholds under which
# a GPU is treated as free (MB of memory in use, percent utilization) —
# confirm the exact comparison in the scheduler.
[scheduler]
gpu_free_memory_mb = 1024
gpu_free_utilization_pct = 10
prefer_pack = true

[sync]
mode = "rsync"
local_path = "."
exclude = [
    ".git/",
    ".venv/",
    ".aituner/",
    "__pycache__/",
    "*.pyc",
]
# Fleet hosts, one [[hosts]] entry per machine. In every entry below,
# ssh_alias is the same string as name.
# TOML performs no "~" expansion: the path values are handed to the consumer
# as literal strings.
# NOTE(review): the sequence goes dash0-dash3 and then dash5; there is no
# dash4 entry — confirm this is intentional.

[[hosts]]
name = "dash0"
ssh_alias = "dash0"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash1"
ssh_alias = "dash1"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash2"
ssh_alias = "dash2"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

# NOTE(review): dash3 syncs to "~/aituner" while every other host uses
# "~/workspace/aituner" — confirm the difference is deliberate.
[[hosts]]
name = "dash3"
ssh_alias = "dash3"
enabled = true
sync_remote_path = "~/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash5"
ssh_alias = "dash5"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

View File

@@ -0,0 +1,27 @@
# Append-only job queue that the monitor reads as its source.
# Once a job has been appended, its name must remain unique and must never
# be changed.
version = 1

[[jobs]]
name = "smoke-train-h20-1gpu"
gpus = 1
gpu_model = "H20"
hosts = ["dash0", "dash1", "dash2"]
command = "python train.py --config configs/smoke.toml"
artifacts = ["outputs/smoke-train-h20-1gpu"]
env = { WANDB_MODE = "offline" }

[[jobs]]
name = "eval-5090-4gpu"
gpus = 4
gpu_model = "5090"
hosts = ["dash5"]
command = "python eval.py --config configs/eval.toml"
artifacts = ["outputs/eval-5090-4gpu", "logs/eval-5090-4gpu.log"]

# NOTE(review): this job sets no gpu_model, unlike the two above —
# presumably the key is optional; confirm how the scheduler treats its absence.
[[jobs]]
name = "special-dash3-run"
gpus = 2
hosts = ["dash3"]
command = "python benchmark.py --suite long-context"
artifacts = ["outputs/special-dash3-run"]

View File

@@ -0,0 +1,8 @@
# One SSH alias per line.
# Lines starting with "#" are ignored.
# NOTE(review): the list goes dash0-dash3 and then dash5; dash4 is absent — confirm this is intentional.
dash0
dash1
dash2
dash3
dash5