Initial AITuner study orchestrator
This commit is contained in:
59
infra/gpu_fleet/config/fleet.example.toml
Normal file
59
infra/gpu_fleet/config/fleet.example.toml
Normal file
@@ -0,0 +1,59 @@
version = 1

[paths]
state_dir = ".aituner/gpu_fleet/state"
artifacts_dir = ".aituner/gpu_fleet/artifacts"

[ssh]
connect_timeout_sec = 10

[scheduler]
gpu_free_memory_mb = 1024
gpu_free_utilization_pct = 10
prefer_pack = true

[sync]
mode = "rsync"
local_path = "."
exclude = [
    ".git/",
    ".venv/",
    ".aituner/",
    "__pycache__/",
    "*.pyc",
]

[[hosts]]
name = "dash0"
ssh_alias = "dash0"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash1"
ssh_alias = "dash1"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash2"
ssh_alias = "dash2"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash3"
ssh_alias = "dash3"
enabled = true
sync_remote_path = "~/aituner"
fleet_root = "~/.aituner_gpu_fleet"

[[hosts]]
name = "dash5"
ssh_alias = "dash5"
enabled = true
sync_remote_path = "~/workspace/aituner"
fleet_root = "~/.aituner_gpu_fleet"
27
infra/gpu_fleet/config/jobs.example.toml
Normal file
27
infra/gpu_fleet/config/jobs.example.toml
Normal file
@@ -0,0 +1,27 @@
# This file is an append-only queue source for the monitor.
# Each job name must stay unique and immutable once appended.
version = 1

[[jobs]]
name = "smoke-train-h20-1gpu"
gpus = 1
gpu_model = "H20"
hosts = ["dash0", "dash1", "dash2"]
command = "python train.py --config configs/smoke.toml"
artifacts = ["outputs/smoke-train-h20-1gpu"]
env = { WANDB_MODE = "offline" }

[[jobs]]
name = "eval-5090-4gpu"
gpus = 4
gpu_model = "5090"
hosts = ["dash5"]
command = "python eval.py --config configs/eval.toml"
artifacts = ["outputs/eval-5090-4gpu", "logs/eval-5090-4gpu.log"]

[[jobs]]
name = "special-dash3-run"
gpus = 2
hosts = ["dash3"]
command = "python benchmark.py --suite long-context"
artifacts = ["outputs/special-dash3-run"]
8
infra/gpu_fleet/config/ssh_aliases.example.txt
Normal file
8
infra/gpu_fleet/config/ssh_aliases.example.txt
Normal file
@@ -0,0 +1,8 @@
# One SSH alias per line.
# Lines starting with "#" are ignored.
dash0
dash1
dash2
dash3
dash5
1132
infra/gpu_fleet/gpu_fleet.py
Executable file
1132
infra/gpu_fleet/gpu_fleet.py
Executable file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user