server: non-blocking stream send — stop one slow client stalling the batch

All three engines emitted tokens with blocking_send on the single
decode/coordinator OS thread. A streaming client that drains slower than
generation fills its 64-slot channel, and blocking_send then blocks the whole
thread: under continuous batching one slow consumer stalls every other running
sequence (and in the serial TP/PP path it blocks admission of the next request
too). The whole point of continuous batching is defeated.

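Fix: switch to try_send. In engine.rs a Full/Closed error sets a client_stalled
flag on the sequence; is_finished() then reports that sequence as finished, so
the scheduler reaps it on the next iteration. In tp_engine/pp_engine emit_text
now returns bool, and the decode loop breaks with finish_reason "error" when it
returns false. That final Done event is itself sent with try_send, so a client
whose channel is still full may never receive it and will only observe the
channel closing. When the sequence/request is dropped its sender drops too,
closing the channel so the client receive loop ends rather than hanging. A slow
client now only loses its own sequence, never the batch.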

Verified on dash5: gpt-oss FP8 at TP=1, streaming via tp_engine, still streams
correctly (SSE chunks, coherent content, no hang); bench-gpt-oss at TP=2 is
unchanged at 5.9 ms TPOT.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-07-01 12:37:32 +08:00
parent cfbd64d206
commit 0314b4f3ac
3 changed files with 60 additions and 20 deletions

View File

@@ -38,6 +38,9 @@ struct Sequence {
seq_slot: Option<usize>, seq_slot: Option<usize>,
sender: tokio::sync::mpsc::Sender<GenerateEvent>, sender: tokio::sync::mpsc::Sender<GenerateEvent>,
prefilled: bool, prefilled: bool,
/// Set when a `try_send` failed (client too slow or gone). The scheduler
/// reaps the sequence next iteration instead of blocking the decode thread.
client_stalled: bool,
eos_token_id: Option<u32>, eos_token_id: Option<u32>,
decode_buffer: Vec<u8>, decode_buffer: Vec<u8>,
created_at: Instant, created_at: Instant,
@@ -370,6 +373,7 @@ impl Engine {
seq_slot: None, seq_slot: None,
sender: req.sender, sender: req.sender,
prefilled: false, prefilled: false,
client_stalled: false,
eos_token_id: self.tokenizer.eos_token_id(), eos_token_id: self.tokenizer.eos_token_id(),
decode_buffer: Vec::new(), decode_buffer: Vec::new(),
created_at: Instant::now(), created_at: Instant::now(),
@@ -392,7 +396,7 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
if tokenizer.eos_token_id() == Some(token_id) { if tokenizer.eos_token_id() == Some(token_id) {
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer); let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
send_token_if_nonempty(seq, tail); send_token_if_nonempty(seq, tail);
let _ = seq.sender.blocking_send(GenerateEvent::Done { try_send_event(seq, GenerateEvent::Done {
finish_reason: "stop".to_string(), finish_reason: "stop".to_string(),
}); });
return; return;
@@ -403,7 +407,7 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer); let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
send_token_if_nonempty(seq, text); send_token_if_nonempty(seq, text);
send_token_if_nonempty(seq, tail); send_token_if_nonempty(seq, tail);
let _ = seq.sender.blocking_send(GenerateEvent::Done { try_send_event(seq, GenerateEvent::Done {
finish_reason: "length".to_string(), finish_reason: "length".to_string(),
}); });
} else { } else {
@@ -411,14 +415,34 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
} }
} }
fn send_token_if_nonempty(seq: &Sequence, text: String) { fn send_token_if_nonempty(seq: &mut Sequence, text: String) {
if !text.is_empty() { if !text.is_empty() {
let id = *seq.generated_tokens.last().unwrap_or(&0); let id = *seq.generated_tokens.last().unwrap_or(&0);
let _ = seq.sender.blocking_send(GenerateEvent::Token { id, text }); try_send_event(seq, GenerateEvent::Token { id, text });
}
}
/// Send an event without blocking the shared decode thread. If the client is
/// too slow (channel full) or gone (closed), flag the sequence for eviction
/// instead of blocking — one slow consumer must never stall the whole
/// continuous-batching loop. When the sequence is reaped its `sender` drops,
/// closing the channel so the client's receive loop ends rather than hanging.
fn try_send_event(seq: &mut Sequence, event: GenerateEvent) {
if let Err(err) = seq.sender.try_send(event) {
seq.client_stalled = true;
if let tokio::sync::mpsc::error::TrySendError::Full(_) = err {
eprintln!(
"[scheduler] seq {}: client too slow (stream channel full), evicting",
seq.id
);
}
} }
} }
fn is_finished(seq: &Sequence) -> bool { fn is_finished(seq: &Sequence) -> bool {
if seq.client_stalled {
return true;
}
if seq.generated_tokens.is_empty() { if seq.generated_tokens.is_empty() {
return false; return false;
} }

View File

@@ -268,9 +268,12 @@ pub fn run_pp(
let mut decode_buf: Vec<u8> = Vec::new(); let mut decode_buf: Vec<u8> = Vec::new();
let mut generated = 1usize; let mut generated = 1usize;
emit_text(&tokenizer, &req, next, &mut decode_buf); let mut stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
let finish = loop { let finish = loop {
if stalled {
break "error";
}
if tokenizer.is_eos(next) { if tokenizer.is_eos(next) {
break "stop"; break "stop";
} }
@@ -289,17 +292,17 @@ pub fn run_pp(
send_hidden(&sc, &x, next_peer); send_hidden(&sc, &x, next_peer);
next = token_rx.recv().expect("decode token"); next = token_rx.recv().expect("decode token");
generated += 1; generated += 1;
emit_text(&tokenizer, &req, next, &mut decode_buf); stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
}; };
let tail = tokenizer.flush_decode_stream(&mut decode_buf); let tail = tokenizer.flush_decode_stream(&mut decode_buf);
if !tail.is_empty() { if !tail.is_empty() {
let _ = req.sender.blocking_send(GenerateEvent::Token { let _ = req.sender.try_send(GenerateEvent::Token {
id: next, id: next,
text: tail, text: tail,
}); });
} }
let _ = req.sender.blocking_send(GenerateEvent::Done { let _ = req.sender.try_send(GenerateEvent::Done {
finish_reason: finish.to_string(), finish_reason: finish.to_string(),
}); });
@@ -312,14 +315,19 @@ pub fn run_pp(
} }
/// Stream a token's decoded text to the client (EOS contributes no text). /// Stream a token's decoded text to the client (EOS contributes no text).
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) { /// Returns false if the send would block (client too slow) or the client is
/// gone — the caller stops generating so the coordinator thread is free to
/// admit the next request instead of blocking on one slow consumer.
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) -> bool {
if tokenizer.is_eos(token_id) { if tokenizer.is_eos(token_id) {
return; return true;
} }
let text = tokenizer.decode_token_stream(token_id, buf); let text = tokenizer.decode_token_stream(token_id, buf);
if !text.is_empty() { if !text.is_empty() {
let _ = req return req
.sender .sender
.blocking_send(GenerateEvent::Token { id: token_id, text }); .try_send(GenerateEvent::Token { id: token_id, text })
.is_ok();
} }
true
} }

View File

@@ -294,9 +294,12 @@ pub fn run_tp(
let mut decode_buf: Vec<u8> = Vec::new(); let mut decode_buf: Vec<u8> = Vec::new();
let mut generated = 1usize; let mut generated = 1usize;
emit_text(&tokenizer, &req, next, &mut decode_buf); let mut stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
let finish = loop { let finish = loop {
if stalled {
break "error";
}
if tokenizer.is_eos(next) { if tokenizer.is_eos(next) {
break "stop"; break "stop";
} }
@@ -317,17 +320,17 @@ pub fn run_tp(
next = pick(&logits, &req.sampling, &gen_ids); next = pick(&logits, &req.sampling, &gen_ids);
gen_ids.push(next); gen_ids.push(next);
generated += 1; generated += 1;
emit_text(&tokenizer, &req, next, &mut decode_buf); stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
}; };
let tail = tokenizer.flush_decode_stream(&mut decode_buf); let tail = tokenizer.flush_decode_stream(&mut decode_buf);
if !tail.is_empty() { if !tail.is_empty() {
let _ = req.sender.blocking_send(GenerateEvent::Token { let _ = req.sender.try_send(GenerateEvent::Token {
id: next, id: next,
text: tail, text: tail,
}); });
} }
let _ = req.sender.blocking_send(GenerateEvent::Done { let _ = req.sender.try_send(GenerateEvent::Done {
finish_reason: finish.to_string(), finish_reason: finish.to_string(),
}); });
@@ -340,14 +343,19 @@ pub fn run_tp(
} }
/// Stream a token's decoded text to the client (EOS contributes no text). /// Stream a token's decoded text to the client (EOS contributes no text).
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) { /// Returns false if the send would block (client too slow) or the client is
/// gone — the caller stops generating so the serial coordinator thread is free
/// to admit the next request instead of blocking on one slow consumer.
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) -> bool {
if tokenizer.is_eos(token_id) { if tokenizer.is_eos(token_id) {
return; return true;
} }
let text = tokenizer.decode_token_stream(token_id, buf); let text = tokenizer.decode_token_stream(token_id, buf);
if !text.is_empty() { if !text.is_empty() {
let _ = req return req
.sender .sender
.blocking_send(GenerateEvent::Token { id: token_id, text }); .try_send(GenerateEvent::Token { id: token_id, text })
.is_ok();
} }
true
} }