Initial AITuner study orchestrator

2026-04-04 21:26:37 +08:00
commit cdcca1d9d7
24 changed files with 3357 additions and 0 deletions
--- a/infra/gpu_fleet/config/fleet.example.toml
+++ b/infra/gpu_fleet/config/fleet.example.toml
@@ -0,0 +1,59 @@
+version = 1  # NOTE(review): presumably the config schema version - confirm against the loader
+
+[paths]
+state_dir = ".aituner/gpu_fleet/state"  # relative path under ".aituner/", which [sync].exclude also lists
+artifacts_dir = ".aituner/gpu_fleet/artifacts"  # relative path; NOTE(review): base directory is not shown here - confirm
+
+[ssh]
+connect_timeout_sec = 10  # seconds, per the key suffix
+
+[scheduler]
+gpu_free_memory_mb = 1024  # MB; NOTE(review): presumably the used-memory ceiling for treating a GPU as free - confirm
+gpu_free_utilization_pct = 10  # percent; NOTE(review): presumably the utilization ceiling for treating a GPU as free - confirm
+prefer_pack = true  # NOTE(review): presumably fills partly used hosts before spreading to idle ones - confirm
+
+[sync]
+mode = "rsync"  # NOTE(review): the only mode shown in this example; other accepted values are unknown
+local_path = "."  # relative path; NOTE(review): presumably resolved against the invoking directory - confirm
+exclude = [  # NOTE(review): presumably handed to rsync as exclude patterns - confirm
+  ".git/",
+  ".venv/",
+  ".aituner/",  # parent of state_dir and artifacts_dir from [paths]
+  "__pycache__/",
+  "*.pyc",
+]
+
+[[hosts]]  # one array element per host; every element below carries the same five keys
+name = "dash0"
+ssh_alias = "dash0"  # NOTE(review): presumably a Host entry in the user's ssh config - confirm
+enabled = true
+sync_remote_path = "~/workspace/aituner"  # TOML keeps "~" literal; expansion is left to the consumer or remote shell
+fleet_root = "~/.aituner_gpu_fleet"
+
+[[hosts]]
+name = "dash1"
+ssh_alias = "dash1"
+enabled = true
+sync_remote_path = "~/workspace/aituner"
+fleet_root = "~/.aituner_gpu_fleet"
+
+[[hosts]]
+name = "dash2"
+ssh_alias = "dash2"
+enabled = true
+sync_remote_path = "~/workspace/aituner"
+fleet_root = "~/.aituner_gpu_fleet"
+
+[[hosts]]
+name = "dash3"
+ssh_alias = "dash3"
+enabled = true
+sync_remote_path = "~/aituner"  # NOTE(review): the other hosts use "~/workspace/aituner" - confirm this difference is intended
+fleet_root = "~/.aituner_gpu_fleet"
+
+[[hosts]]
+name = "dash5"  # NOTE(review): the list has no dash4 entry - confirm the gap is intended
+ssh_alias = "dash5"
+enabled = true
+sync_remote_path = "~/workspace/aituner"
+fleet_root = "~/.aituner_gpu_fleet"