server: serve gpt-oss on a single GPU via the TP engine (world=1)
gpt-oss has no single-GPU engine path, so --tp 1 fell through to the
Qwen3-only engine and every request failed with HTTP 503. Route gpt_oss
to run_tp even at tp=1: NCCL init works with a world size of 1, and
all_reduce is already a no-op there (bench-gpt-oss --tp 1 exercised this
path). Quantized gpt-oss (22 GB FP8 / 13 GB MXFP4) now serves on one
32 GB 5090.
Also fix eval_gsm8k_fast.py --gpu to accept a device list ("2,3"): the
option was declared type=int, so any --tp 2 run pinned
CUDA_VISIBLE_DEVICES to a single GPU; rank 1's set_device then panicked
while rank 0 spun in NCCL init.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -87,11 +87,14 @@ async fn main() {
|
|||||||
let (tx, rx) = mpsc::channel::<GenerateRequest>();
|
let (tx, rx) = mpsc::channel::<GenerateRequest>();
|
||||||
|
|
||||||
let model_dir_clone = model_dir.clone();
|
let model_dir_clone = model_dir.clone();
|
||||||
|
// gpt-oss is only implemented in the TP engine; route it there even at
|
||||||
|
// tp=1 (single-rank world) so quantized models can serve on one GPU.
|
||||||
|
let is_gpt_oss = model_config.model_type.as_deref() == Some("gpt_oss");
|
||||||
std::thread::spawn(move || {
|
std::thread::spawn(move || {
|
||||||
if pp > 1 {
|
if pp > 1 {
|
||||||
// Pipeline-parallel path: stage-0 coordinator + worker stage threads.
|
// Pipeline-parallel path: stage-0 coordinator + worker stage threads.
|
||||||
pp_engine::run_pp(&model_dir_clone, pp, max_seq_len, rx);
|
pp_engine::run_pp(&model_dir_clone, pp, max_seq_len, rx);
|
||||||
} else if tp <= 1 {
|
} else if tp <= 1 && !is_gpt_oss {
|
||||||
let mut engine = engine::Engine::load_with_swap(&model_dir_clone, max_batch, max_seq_len, swap_space_gb);
|
let mut engine = engine::Engine::load_with_swap(&model_dir_clone, max_batch, max_seq_len, swap_space_gb);
|
||||||
engine.run(rx);
|
engine.run(rx);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -120,7 +120,9 @@ fn worker_loop(
|
|||||||
/// Run the TP coordinator (rank 0) on the calling thread. Spawns worker ranks
|
/// Run the TP coordinator (rank 0) on the calling thread. Spawns worker ranks
|
||||||
/// internally and consumes generation requests from `rx`.
|
/// internally and consumes generation requests from `rx`.
|
||||||
pub fn run_tp(model_dir: &Path, world: usize, max_seq_len: usize, rx: mpsc::Receiver<GenerateRequest>) {
|
pub fn run_tp(model_dir: &Path, world: usize, max_seq_len: usize, rx: mpsc::Receiver<GenerateRequest>) {
|
||||||
assert!(world >= 2, "run_tp requires world >= 2");
|
// world=1 is a valid single-rank configuration (gpt-oss has no
|
||||||
|
// single-GPU engine path; NCCL init and all_reduce no-op at world=1).
|
||||||
|
assert!(world >= 1, "run_tp requires world >= 1");
|
||||||
let config = ModelConfig::from_file(&model_dir.join("config.json"));
|
let config = ModelConfig::from_file(&model_dir.join("config.json"));
|
||||||
assert!(
|
assert!(
|
||||||
config.num_kv_heads() % world == 0,
|
config.num_kv_heads() % world == 0,
|
||||||
|
|||||||
@@ -81,7 +81,8 @@ def main():
|
|||||||
parser.add_argument("--max-tokens", type=int, default=512, help="Max generation tokens")
|
parser.add_argument("--max-tokens", type=int, default=512, help="Max generation tokens")
|
||||||
parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism")
|
parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism")
|
||||||
parser.add_argument("--offset", type=int, default=0, help="Start from problem N")
|
parser.add_argument("--offset", type=int, default=0, help="Start from problem N")
|
||||||
parser.add_argument("--gpu", type=int, default=0, help="GPU device index")
|
parser.add_argument("--gpu", type=str, default="0",
|
||||||
|
help="CUDA_VISIBLE_DEVICES value, e.g. '0' or '2,3' (must cover --tp ranks)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if not DATA_PATH.exists():
|
if not DATA_PATH.exists():
|
||||||
|
|||||||
Reference in New Issue
Block a user