server: non-blocking stream send — stop one slow client stalling the batch
All three engines emitted tokens with blocking_send on the single decode/coordinator OS thread. A streaming client that drains slower than generation fills its 64-slot channel, and blocking_send then blocks the whole thread: under continuous batching one slow consumer stalls every other running sequence (and in the serial TP/PP path it blocks admission of the next request too). The whole point of continuous batching is defeated. Fix: switch to try_send. engine.rs sets a client_stalled flag on the sequence when try_send returns Full or Closed, and is_finished() then reports that sequence as finished so the scheduler reaps it on the next iteration; in tp_engine/pp_engine, emit_text returns a bool and the decode loop breaks with finish_reason "error" when it returns false. When the sequence/request is dropped its sender drops too, closing the channel so the client receive loop ends rather than hanging. A slow client now only loses its own sequence, never the batch. Verified on dash5: gpt-oss FP8 TP=1 streaming via tp_engine still streams correctly (SSE chunks, coherent content, no hang); bench-gpt-oss TP=2 5.9ms TPOT unchanged. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -38,6 +38,9 @@ struct Sequence {
|
||||
seq_slot: Option<usize>,
|
||||
sender: tokio::sync::mpsc::Sender<GenerateEvent>,
|
||||
prefilled: bool,
|
||||
/// Set when a `try_send` failed (client too slow or gone). The scheduler
|
||||
/// reaps the sequence next iteration instead of blocking the decode thread.
|
||||
client_stalled: bool,
|
||||
eos_token_id: Option<u32>,
|
||||
decode_buffer: Vec<u8>,
|
||||
created_at: Instant,
|
||||
@@ -370,6 +373,7 @@ impl Engine {
|
||||
seq_slot: None,
|
||||
sender: req.sender,
|
||||
prefilled: false,
|
||||
client_stalled: false,
|
||||
eos_token_id: self.tokenizer.eos_token_id(),
|
||||
decode_buffer: Vec::new(),
|
||||
created_at: Instant::now(),
|
||||
@@ -392,7 +396,7 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
|
||||
if tokenizer.eos_token_id() == Some(token_id) {
|
||||
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
|
||||
send_token_if_nonempty(seq, tail);
|
||||
let _ = seq.sender.blocking_send(GenerateEvent::Done {
|
||||
try_send_event(seq, GenerateEvent::Done {
|
||||
finish_reason: "stop".to_string(),
|
||||
});
|
||||
return;
|
||||
@@ -403,7 +407,7 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
|
||||
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
|
||||
send_token_if_nonempty(seq, text);
|
||||
send_token_if_nonempty(seq, tail);
|
||||
let _ = seq.sender.blocking_send(GenerateEvent::Done {
|
||||
try_send_event(seq, GenerateEvent::Done {
|
||||
finish_reason: "length".to_string(),
|
||||
});
|
||||
} else {
|
||||
@@ -411,14 +415,34 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
|
||||
}
|
||||
}
|
||||
|
||||
fn send_token_if_nonempty(seq: &Sequence, text: String) {
|
||||
fn send_token_if_nonempty(seq: &mut Sequence, text: String) {
|
||||
if !text.is_empty() {
|
||||
let id = *seq.generated_tokens.last().unwrap_or(&0);
|
||||
let _ = seq.sender.blocking_send(GenerateEvent::Token { id, text });
|
||||
try_send_event(seq, GenerateEvent::Token { id, text });
|
||||
}
|
||||
}
|
||||
|
||||
/// Send an event without blocking the shared decode thread. If the client is
|
||||
/// too slow (channel full) or gone (closed), flag the sequence for eviction
|
||||
/// instead of blocking — one slow consumer must never stall the whole
|
||||
/// continuous-batching loop. When the sequence is reaped its `sender` drops,
|
||||
/// closing the channel so the client's receive loop ends rather than hanging.
|
||||
fn try_send_event(seq: &mut Sequence, event: GenerateEvent) {
|
||||
if let Err(err) = seq.sender.try_send(event) {
|
||||
seq.client_stalled = true;
|
||||
if let tokio::sync::mpsc::error::TrySendError::Full(_) = err {
|
||||
eprintln!(
|
||||
"[scheduler] seq {}: client too slow (stream channel full), evicting",
|
||||
seq.id
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_finished(seq: &Sequence) -> bool {
|
||||
if seq.client_stalled {
|
||||
return true;
|
||||
}
|
||||
if seq.generated_tokens.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -268,9 +268,12 @@ pub fn run_pp(
|
||||
|
||||
let mut decode_buf: Vec<u8> = Vec::new();
|
||||
let mut generated = 1usize;
|
||||
emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||
let mut stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||
|
||||
let finish = loop {
|
||||
if stalled {
|
||||
break "error";
|
||||
}
|
||||
if tokenizer.is_eos(next) {
|
||||
break "stop";
|
||||
}
|
||||
@@ -289,17 +292,17 @@ pub fn run_pp(
|
||||
send_hidden(&sc, &x, next_peer);
|
||||
next = token_rx.recv().expect("decode token");
|
||||
generated += 1;
|
||||
emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||
stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||
};
|
||||
|
||||
let tail = tokenizer.flush_decode_stream(&mut decode_buf);
|
||||
if !tail.is_empty() {
|
||||
let _ = req.sender.blocking_send(GenerateEvent::Token {
|
||||
let _ = req.sender.try_send(GenerateEvent::Token {
|
||||
id: next,
|
||||
text: tail,
|
||||
});
|
||||
}
|
||||
let _ = req.sender.blocking_send(GenerateEvent::Done {
|
||||
let _ = req.sender.try_send(GenerateEvent::Done {
|
||||
finish_reason: finish.to_string(),
|
||||
});
|
||||
|
||||
@@ -312,14 +315,19 @@ pub fn run_pp(
|
||||
}
|
||||
|
||||
/// Stream a token's decoded text to the client (EOS contributes no text).
|
||||
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) {
|
||||
/// Returns false if the send would block (client too slow) or the client is
|
||||
/// gone — the caller stops generating so the coordinator thread is free to
|
||||
/// admit the next request instead of blocking on one slow consumer.
|
||||
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) -> bool {
|
||||
if tokenizer.is_eos(token_id) {
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
let text = tokenizer.decode_token_stream(token_id, buf);
|
||||
if !text.is_empty() {
|
||||
let _ = req
|
||||
return req
|
||||
.sender
|
||||
.blocking_send(GenerateEvent::Token { id: token_id, text });
|
||||
.try_send(GenerateEvent::Token { id: token_id, text })
|
||||
.is_ok();
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
@@ -294,9 +294,12 @@ pub fn run_tp(
|
||||
|
||||
let mut decode_buf: Vec<u8> = Vec::new();
|
||||
let mut generated = 1usize;
|
||||
emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||
let mut stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||
|
||||
let finish = loop {
|
||||
if stalled {
|
||||
break "error";
|
||||
}
|
||||
if tokenizer.is_eos(next) {
|
||||
break "stop";
|
||||
}
|
||||
@@ -317,17 +320,17 @@ pub fn run_tp(
|
||||
next = pick(&logits, &req.sampling, &gen_ids);
|
||||
gen_ids.push(next);
|
||||
generated += 1;
|
||||
emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||
stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||
};
|
||||
|
||||
let tail = tokenizer.flush_decode_stream(&mut decode_buf);
|
||||
if !tail.is_empty() {
|
||||
let _ = req.sender.blocking_send(GenerateEvent::Token {
|
||||
let _ = req.sender.try_send(GenerateEvent::Token {
|
||||
id: next,
|
||||
text: tail,
|
||||
});
|
||||
}
|
||||
let _ = req.sender.blocking_send(GenerateEvent::Done {
|
||||
let _ = req.sender.try_send(GenerateEvent::Done {
|
||||
finish_reason: finish.to_string(),
|
||||
});
|
||||
|
||||
@@ -340,14 +343,19 @@ pub fn run_tp(
|
||||
}
|
||||
|
||||
/// Stream a token's decoded text to the client (EOS contributes no text).
|
||||
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) {
|
||||
/// Returns false if the send would block (client too slow) or the client is
|
||||
/// gone — the caller stops generating so the serial coordinator thread is free
|
||||
/// to admit the next request instead of blocking on one slow consumer.
|
||||
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) -> bool {
|
||||
if tokenizer.is_eos(token_id) {
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
let text = tokenizer.decode_token_stream(token_id, buf);
|
||||
if !text.is_empty() {
|
||||
let _ = req
|
||||
return req
|
||||
.sender
|
||||
.blocking_send(GenerateEvent::Token { id: token_id, text });
|
||||
.try_send(GenerateEvent::Token { id: token_id, text })
|
||||
.is_ok();
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user