Initial AITuner study orchestrator
This commit is contained in:
59
infra/gpu_fleet/config/fleet.example.toml
Normal file
59
infra/gpu_fleet/config/fleet.example.toml
Normal file
@@ -0,0 +1,59 @@
version = 1

[paths]
state_dir = ".aituner/gpu_fleet/state"
artifacts_dir = ".aituner/gpu_fleet/artifacts"

[ssh]
connect_timeout_sec = 10

[scheduler]
gpu_free_memory_mb = 1024
gpu_free_utilization_pct = 10
prefer_pack = true

[sync]
mode = "rsync"
local_path = "."
exclude = [
    ".git/",
    ".venv/",
    ".aituner/",
    "__pycache__/",
    "*.pyc",
]

[[hosts]]
name = "dash0"
ssh_alias = "dash0"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash1"
ssh_alias = "dash1"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash2"
ssh_alias = "dash2"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash3"
ssh_alias = "dash3"
enabled = true
sync_remote_path = "~/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash5"
ssh_alias = "dash5"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"
27
infra/gpu_fleet/config/jobs.example.toml
Normal file
27
infra/gpu_fleet/config/jobs.example.toml
Normal file
@@ -0,0 +1,27 @@
# This file is an append-only queue source for the monitor.
# Each job name must stay unique and immutable once appended.
version = 1

[[jobs]]
name = "smoke-train-h20-1gpu"
gpus = 1
gpu_model = "H20"
hosts = ["dash0", "dash1", "dash2"]
command = "python train.py --config configs/smoke.toml"
artifacts = ["outputs/smoke-train-h20-1gpu"]
env = { WANDB_MODE = "offline" }

[[jobs]]
name = "eval-5090-4gpu"
gpus = 4
gpu_model = "5090"
hosts = ["dash5"]
command = "python eval.py --config configs/eval.toml"
artifacts = ["outputs/eval-5090-4gpu", "logs/eval-5090-4gpu.log"]

[[jobs]]
name = "special-dash3-run"
gpus = 2
hosts = ["dash3"]
command = "python benchmark.py --suite long-context"
artifacts = ["outputs/special-dash3-run"]
8
infra/gpu_fleet/config/ssh_aliases.example.txt
Normal file
8
infra/gpu_fleet/config/ssh_aliases.example.txt
Normal file
@@ -0,0 +1,8 @@
# One SSH alias per line.
# Lines starting with "#" are ignored.
dash0
dash1
dash2
dash3
dash5
1132
infra/gpu_fleet/gpu_fleet.py
Executable file
1132
infra/gpu_fleet/gpu_fleet.py
Executable file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user