diff --git a/crates/xserv-server/src/main.rs b/crates/xserv-server/src/main.rs index aee0d53..497ca8d 100644 --- a/crates/xserv-server/src/main.rs +++ b/crates/xserv-server/src/main.rs @@ -87,11 +87,14 @@ async fn main() { let (tx, rx) = mpsc::channel::(); let model_dir_clone = model_dir.clone(); + // gpt-oss is only implemented in the TP engine; route it there even at + // tp=1 (single-rank world) so quantized models can serve on one GPU. + let is_gpt_oss = model_config.model_type.as_deref() == Some("gpt_oss"); std::thread::spawn(move || { if pp > 1 { // Pipeline-parallel path: stage-0 coordinator + worker stage threads. pp_engine::run_pp(&model_dir_clone, pp, max_seq_len, rx); - } else if tp <= 1 { + } else if tp <= 1 && !is_gpt_oss { let mut engine = engine::Engine::load_with_swap(&model_dir_clone, max_batch, max_seq_len, swap_space_gb); engine.run(rx); } else { diff --git a/crates/xserv-server/src/tp_engine.rs b/crates/xserv-server/src/tp_engine.rs index 36602e6..be6453e 100644 --- a/crates/xserv-server/src/tp_engine.rs +++ b/crates/xserv-server/src/tp_engine.rs @@ -120,7 +120,9 @@ fn worker_loop( /// Run the TP coordinator (rank 0) on the calling thread. Spawns worker ranks /// internally and consumes generation requests from `rx`. pub fn run_tp(model_dir: &Path, world: usize, max_seq_len: usize, rx: mpsc::Receiver) { - assert!(world >= 2, "run_tp requires world >= 2"); + // world=1 is a valid single-rank configuration (gpt-oss has no + // single-GPU engine path; NCCL init and all_reduce no-op at world=1). + assert!(world >= 1, "run_tp requires world >= 1"); let config = ModelConfig::from_file(&model_dir.join("config.json")); assert!( config.num_kv_heads() % world == 0, diff --git a/tools/eval_gsm8k_fast.py b/tools/eval_gsm8k_fast.py index b37c5dd..268bdda 100644 --- a/tools/eval_gsm8k_fast.py +++ b/tools/eval_gsm8k_fast.py @@ -81,7 +81,8 @@ def main(): parser.add_argument("--max-tokens", type=int, default=512, help="Max generation tokens") parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism") parser.add_argument("--offset", type=int, default=0, help="Start from problem N") - parser.add_argument("--gpu", type=int, default=0, help="GPU device index") + parser.add_argument("--gpu", type=str, default="0", + help="CUDA_VISIBLE_DEVICES value, e.g. '0' or '2,3' (must cover --tp ranks)") args = parser.parse_args() if not DATA_PATH.exists():