server: non-blocking stream send — stop one slow client stalling the batch

All three engines emitted tokens with blocking_send on the single
decode/coordinator OS thread. A streaming client that drains slower than
generation fills its 64-slot channel, and blocking_send then blocks the whole
thread: under continuous batching one slow consumer stalls every other running
sequence (and in the serial TP/PP path it blocks admission of the next request
too). The whole point of continuous batching is defeated.

Fix: switch to try_send. engine.rs sets a client_stalled flag on Full/Closed,
which is_finished() checks so the sequence is reaped on the next iteration;
tp_engine/pp_engine emit_text now returns bool and the decode loop breaks with
finish_reason "error" when a send fails. That final Done event is itself sent
with try_send, so it may not be delivered if the channel is still full. When the
sequence/request is dropped its sender drops too, closing the channel so the
client receive loop ends rather than hanging. A slow client now only loses its
own sequence, never the batch.

Verified on dash5: gpt-oss FP8 TP=1 streaming via tp_engine still streams
correctly (SSE chunks, coherent content, no hang); bench-gpt-oss at TP=2 shows
TPOT unchanged at 5.9ms.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-07-01 12:37:32 +08:00
parent cfbd64d206
commit 0314b4f3ac
3 changed files with 60 additions and 20 deletions

View File

@@ -38,6 +38,9 @@ struct Sequence {
seq_slot: Option<usize>,
sender: tokio::sync::mpsc::Sender<GenerateEvent>,
prefilled: bool,
/// Set when a `try_send` failed (client too slow or gone). The scheduler
/// reaps the sequence next iteration instead of blocking the decode thread.
client_stalled: bool,
eos_token_id: Option<u32>,
decode_buffer: Vec<u8>,
created_at: Instant,
@@ -370,6 +373,7 @@ impl Engine {
seq_slot: None,
sender: req.sender,
prefilled: false,
client_stalled: false,
eos_token_id: self.tokenizer.eos_token_id(),
decode_buffer: Vec::new(),
created_at: Instant::now(),
@@ -392,7 +396,7 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
if tokenizer.eos_token_id() == Some(token_id) {
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
send_token_if_nonempty(seq, tail);
let _ = seq.sender.blocking_send(GenerateEvent::Done {
try_send_event(seq, GenerateEvent::Done {
finish_reason: "stop".to_string(),
});
return;
@@ -403,7 +407,7 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
send_token_if_nonempty(seq, text);
send_token_if_nonempty(seq, tail);
let _ = seq.sender.blocking_send(GenerateEvent::Done {
try_send_event(seq, GenerateEvent::Done {
finish_reason: "length".to_string(),
});
} else {
@@ -411,14 +415,34 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
}
}
fn send_token_if_nonempty(seq: &Sequence, text: String) {
fn send_token_if_nonempty(seq: &mut Sequence, text: String) {
if !text.is_empty() {
let id = *seq.generated_tokens.last().unwrap_or(&0);
let _ = seq.sender.blocking_send(GenerateEvent::Token { id, text });
try_send_event(seq, GenerateEvent::Token { id, text });
}
}
/// Send an event without blocking the shared decode thread.
///
/// If the client is too slow (channel full) or gone (closed), the sequence is
/// flagged via `client_stalled` for eviction instead of blocking — one slow
/// consumer must never stall the whole continuous-batching loop. When the
/// sequence is reaped its `sender` drops, closing the channel so the client's
/// receive loop ends rather than hanging.
///
/// Once a sequence is flagged, every later event for it is discarded without
/// touching the channel. A single `emit_token` call can issue several sends in
/// a row (token text, flushed tail, `Done`); if an earlier one was dropped and
/// the client then freed a slot, a later one would otherwise succeed and the
/// client would see a stream with a silent gap followed by a normal finish.
fn try_send_event(seq: &mut Sequence, event: GenerateEvent) {
    // The stream already has a hole: never deliver anything after it, and do
    // not repeat the eviction log line for the same sequence.
    if seq.client_stalled {
        return;
    }
    if let Err(err) = seq.sender.try_send(event) {
        seq.client_stalled = true;
        // Only a full channel is worth logging; a closed channel means the
        // client went away, and that case stays silent.
        if let tokio::sync::mpsc::error::TrySendError::Full(_) = err {
            eprintln!(
                "[scheduler] seq {}: client too slow (stream channel full), evicting",
                seq.id
            );
        }
    }
}
fn is_finished(seq: &Sequence) -> bool {
if seq.client_stalled {
return true;
}
if seq.generated_tokens.is_empty() {
return false;
}

View File

@@ -268,9 +268,12 @@ pub fn run_pp(
let mut decode_buf: Vec<u8> = Vec::new();
let mut generated = 1usize;
emit_text(&tokenizer, &req, next, &mut decode_buf);
let mut stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
let finish = loop {
if stalled {
break "error";
}
if tokenizer.is_eos(next) {
break "stop";
}
@@ -289,17 +292,17 @@ pub fn run_pp(
send_hidden(&sc, &x, next_peer);
next = token_rx.recv().expect("decode token");
generated += 1;
emit_text(&tokenizer, &req, next, &mut decode_buf);
stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
};
let tail = tokenizer.flush_decode_stream(&mut decode_buf);
if !tail.is_empty() {
let _ = req.sender.blocking_send(GenerateEvent::Token {
let _ = req.sender.try_send(GenerateEvent::Token {
id: next,
text: tail,
});
}
let _ = req.sender.blocking_send(GenerateEvent::Done {
let _ = req.sender.try_send(GenerateEvent::Done {
finish_reason: finish.to_string(),
});
@@ -312,14 +315,19 @@ pub fn run_pp(
}
/// Stream a token's decoded text to the client (EOS contributes no text).
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) {
/// Returns false if the send would block (client too slow) or the client is
/// gone — the caller stops generating so the coordinator thread is free to
/// admit the next request instead of blocking on one slow consumer.
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) -> bool {
if tokenizer.is_eos(token_id) {
return;
return true;
}
let text = tokenizer.decode_token_stream(token_id, buf);
if !text.is_empty() {
let _ = req
return req
.sender
.blocking_send(GenerateEvent::Token { id: token_id, text });
.try_send(GenerateEvent::Token { id: token_id, text })
.is_ok();
}
true
}

View File

@@ -294,9 +294,12 @@ pub fn run_tp(
let mut decode_buf: Vec<u8> = Vec::new();
let mut generated = 1usize;
emit_text(&tokenizer, &req, next, &mut decode_buf);
let mut stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
let finish = loop {
if stalled {
break "error";
}
if tokenizer.is_eos(next) {
break "stop";
}
@@ -317,17 +320,17 @@ pub fn run_tp(
next = pick(&logits, &req.sampling, &gen_ids);
gen_ids.push(next);
generated += 1;
emit_text(&tokenizer, &req, next, &mut decode_buf);
stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
};
let tail = tokenizer.flush_decode_stream(&mut decode_buf);
if !tail.is_empty() {
let _ = req.sender.blocking_send(GenerateEvent::Token {
let _ = req.sender.try_send(GenerateEvent::Token {
id: next,
text: tail,
});
}
let _ = req.sender.blocking_send(GenerateEvent::Done {
let _ = req.sender.try_send(GenerateEvent::Done {
finish_reason: finish.to_string(),
});
@@ -340,14 +343,19 @@ pub fn run_tp(
}
/// Stream a token's decoded text to the client (EOS contributes no text).
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) {
/// Returns false if the send would block (client too slow) or the client is
/// gone — the caller stops generating so the serial coordinator thread is free
/// to admit the next request instead of blocking on one slow consumer.
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) -> bool {
if tokenizer.is_eos(token_id) {
return;
return true;
}
let text = tokenizer.decode_token_stream(token_id, buf);
if !text.is_empty() {
let _ = req
return req
.sender
.blocking_send(GenerateEvent::Token { id: token_id, text });
.try_send(GenerateEvent::Token { id: token_id, text })
.is_ok();
}
true
}