server: non-blocking stream send — stop one slow client stalling the batch
All three engines emitted tokens with blocking_send on the single decode/coordinator OS thread. A streaming client that drains its channel more slowly than tokens are generated fills the 64-slot channel, and blocking_send then blocks the whole thread: under continuous batching, one slow consumer stalls every other running sequence (and in the serial TP/PP path it also blocks admission of the next request). This defeats the whole point of continuous batching. Fix: switch to try_send. In engine.rs, a failed try_send (Full or Closed) sets a client_stalled flag on the sequence; is_finished() then reports the sequence as finished, so it is reaped on the next iteration. In tp_engine and pp_engine, emit_text now returns a bool, and the decode loop breaks with finish_reason "error" when it returns false. When the sequence/request is dropped, its sender is dropped too, closing the channel so that the client's receive loop ends rather than hanging. A slow client now loses only its own sequence, never the batch. Verified on dash5: gpt-oss FP8 TP=1 streaming via tp_engine still streams correctly (SSE chunks, coherent content, no hang); bench-gpt-oss TP=2 TPOT is unchanged at 5.9 ms. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -38,6 +38,9 @@ struct Sequence {
|
|||||||
seq_slot: Option<usize>,
|
seq_slot: Option<usize>,
|
||||||
sender: tokio::sync::mpsc::Sender<GenerateEvent>,
|
sender: tokio::sync::mpsc::Sender<GenerateEvent>,
|
||||||
prefilled: bool,
|
prefilled: bool,
|
||||||
|
/// Set when a `try_send` failed (client too slow or gone). The scheduler
|
||||||
|
/// reaps the sequence next iteration instead of blocking the decode thread.
|
||||||
|
client_stalled: bool,
|
||||||
eos_token_id: Option<u32>,
|
eos_token_id: Option<u32>,
|
||||||
decode_buffer: Vec<u8>,
|
decode_buffer: Vec<u8>,
|
||||||
created_at: Instant,
|
created_at: Instant,
|
||||||
@@ -370,6 +373,7 @@ impl Engine {
|
|||||||
seq_slot: None,
|
seq_slot: None,
|
||||||
sender: req.sender,
|
sender: req.sender,
|
||||||
prefilled: false,
|
prefilled: false,
|
||||||
|
client_stalled: false,
|
||||||
eos_token_id: self.tokenizer.eos_token_id(),
|
eos_token_id: self.tokenizer.eos_token_id(),
|
||||||
decode_buffer: Vec::new(),
|
decode_buffer: Vec::new(),
|
||||||
created_at: Instant::now(),
|
created_at: Instant::now(),
|
||||||
@@ -392,7 +396,7 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
|
|||||||
if tokenizer.eos_token_id() == Some(token_id) {
|
if tokenizer.eos_token_id() == Some(token_id) {
|
||||||
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
|
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
|
||||||
send_token_if_nonempty(seq, tail);
|
send_token_if_nonempty(seq, tail);
|
||||||
let _ = seq.sender.blocking_send(GenerateEvent::Done {
|
try_send_event(seq, GenerateEvent::Done {
|
||||||
finish_reason: "stop".to_string(),
|
finish_reason: "stop".to_string(),
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
@@ -403,7 +407,7 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
|
|||||||
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
|
let tail = tokenizer.flush_decode_stream(&mut seq.decode_buffer);
|
||||||
send_token_if_nonempty(seq, text);
|
send_token_if_nonempty(seq, text);
|
||||||
send_token_if_nonempty(seq, tail);
|
send_token_if_nonempty(seq, tail);
|
||||||
let _ = seq.sender.blocking_send(GenerateEvent::Done {
|
try_send_event(seq, GenerateEvent::Done {
|
||||||
finish_reason: "length".to_string(),
|
finish_reason: "length".to_string(),
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
@@ -411,14 +415,34 @@ fn emit_token(tokenizer: &Tokenizer, seq: &mut Sequence, token_id: u32) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn send_token_if_nonempty(seq: &Sequence, text: String) {
|
fn send_token_if_nonempty(seq: &mut Sequence, text: String) {
|
||||||
if !text.is_empty() {
|
if !text.is_empty() {
|
||||||
let id = *seq.generated_tokens.last().unwrap_or(&0);
|
let id = *seq.generated_tokens.last().unwrap_or(&0);
|
||||||
let _ = seq.sender.blocking_send(GenerateEvent::Token { id, text });
|
try_send_event(seq, GenerateEvent::Token { id, text });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send an event without blocking the shared decode thread. If the client is
|
||||||
|
/// too slow (channel full) or gone (closed), flag the sequence for eviction
|
||||||
|
/// instead of blocking — one slow consumer must never stall the whole
|
||||||
|
/// continuous-batching loop. When the sequence is reaped its `sender` drops,
|
||||||
|
/// closing the channel so the client's receive loop ends rather than hanging.
|
||||||
|
fn try_send_event(seq: &mut Sequence, event: GenerateEvent) {
|
||||||
|
if let Err(err) = seq.sender.try_send(event) {
|
||||||
|
seq.client_stalled = true;
|
||||||
|
if let tokio::sync::mpsc::error::TrySendError::Full(_) = err {
|
||||||
|
eprintln!(
|
||||||
|
"[scheduler] seq {}: client too slow (stream channel full), evicting",
|
||||||
|
seq.id
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_finished(seq: &Sequence) -> bool {
|
fn is_finished(seq: &Sequence) -> bool {
|
||||||
|
if seq.client_stalled {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if seq.generated_tokens.is_empty() {
|
if seq.generated_tokens.is_empty() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -268,9 +268,12 @@ pub fn run_pp(
|
|||||||
|
|
||||||
let mut decode_buf: Vec<u8> = Vec::new();
|
let mut decode_buf: Vec<u8> = Vec::new();
|
||||||
let mut generated = 1usize;
|
let mut generated = 1usize;
|
||||||
emit_text(&tokenizer, &req, next, &mut decode_buf);
|
let mut stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||||
|
|
||||||
let finish = loop {
|
let finish = loop {
|
||||||
|
if stalled {
|
||||||
|
break "error";
|
||||||
|
}
|
||||||
if tokenizer.is_eos(next) {
|
if tokenizer.is_eos(next) {
|
||||||
break "stop";
|
break "stop";
|
||||||
}
|
}
|
||||||
@@ -289,17 +292,17 @@ pub fn run_pp(
|
|||||||
send_hidden(&sc, &x, next_peer);
|
send_hidden(&sc, &x, next_peer);
|
||||||
next = token_rx.recv().expect("decode token");
|
next = token_rx.recv().expect("decode token");
|
||||||
generated += 1;
|
generated += 1;
|
||||||
emit_text(&tokenizer, &req, next, &mut decode_buf);
|
stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||||
};
|
};
|
||||||
|
|
||||||
let tail = tokenizer.flush_decode_stream(&mut decode_buf);
|
let tail = tokenizer.flush_decode_stream(&mut decode_buf);
|
||||||
if !tail.is_empty() {
|
if !tail.is_empty() {
|
||||||
let _ = req.sender.blocking_send(GenerateEvent::Token {
|
let _ = req.sender.try_send(GenerateEvent::Token {
|
||||||
id: next,
|
id: next,
|
||||||
text: tail,
|
text: tail,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
let _ = req.sender.blocking_send(GenerateEvent::Done {
|
let _ = req.sender.try_send(GenerateEvent::Done {
|
||||||
finish_reason: finish.to_string(),
|
finish_reason: finish.to_string(),
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -312,14 +315,19 @@ pub fn run_pp(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Stream a token's decoded text to the client (EOS contributes no text).
|
/// Stream a token's decoded text to the client (EOS contributes no text).
|
||||||
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) {
|
/// Returns false if the send would block (client too slow) or the client is
|
||||||
|
/// gone — the caller stops generating so the coordinator thread is free to
|
||||||
|
/// admit the next request instead of blocking on one slow consumer.
|
||||||
|
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) -> bool {
|
||||||
if tokenizer.is_eos(token_id) {
|
if tokenizer.is_eos(token_id) {
|
||||||
return;
|
return true;
|
||||||
}
|
}
|
||||||
let text = tokenizer.decode_token_stream(token_id, buf);
|
let text = tokenizer.decode_token_stream(token_id, buf);
|
||||||
if !text.is_empty() {
|
if !text.is_empty() {
|
||||||
let _ = req
|
return req
|
||||||
.sender
|
.sender
|
||||||
.blocking_send(GenerateEvent::Token { id: token_id, text });
|
.try_send(GenerateEvent::Token { id: token_id, text })
|
||||||
|
.is_ok();
|
||||||
}
|
}
|
||||||
|
true
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -294,9 +294,12 @@ pub fn run_tp(
|
|||||||
|
|
||||||
let mut decode_buf: Vec<u8> = Vec::new();
|
let mut decode_buf: Vec<u8> = Vec::new();
|
||||||
let mut generated = 1usize;
|
let mut generated = 1usize;
|
||||||
emit_text(&tokenizer, &req, next, &mut decode_buf);
|
let mut stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||||
|
|
||||||
let finish = loop {
|
let finish = loop {
|
||||||
|
if stalled {
|
||||||
|
break "error";
|
||||||
|
}
|
||||||
if tokenizer.is_eos(next) {
|
if tokenizer.is_eos(next) {
|
||||||
break "stop";
|
break "stop";
|
||||||
}
|
}
|
||||||
@@ -317,17 +320,17 @@ pub fn run_tp(
|
|||||||
next = pick(&logits, &req.sampling, &gen_ids);
|
next = pick(&logits, &req.sampling, &gen_ids);
|
||||||
gen_ids.push(next);
|
gen_ids.push(next);
|
||||||
generated += 1;
|
generated += 1;
|
||||||
emit_text(&tokenizer, &req, next, &mut decode_buf);
|
stalled = !emit_text(&tokenizer, &req, next, &mut decode_buf);
|
||||||
};
|
};
|
||||||
|
|
||||||
let tail = tokenizer.flush_decode_stream(&mut decode_buf);
|
let tail = tokenizer.flush_decode_stream(&mut decode_buf);
|
||||||
if !tail.is_empty() {
|
if !tail.is_empty() {
|
||||||
let _ = req.sender.blocking_send(GenerateEvent::Token {
|
let _ = req.sender.try_send(GenerateEvent::Token {
|
||||||
id: next,
|
id: next,
|
||||||
text: tail,
|
text: tail,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
let _ = req.sender.blocking_send(GenerateEvent::Done {
|
let _ = req.sender.try_send(GenerateEvent::Done {
|
||||||
finish_reason: finish.to_string(),
|
finish_reason: finish.to_string(),
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -340,14 +343,19 @@ pub fn run_tp(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Stream a token's decoded text to the client (EOS contributes no text).
|
/// Stream a token's decoded text to the client (EOS contributes no text).
|
||||||
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) {
|
/// Returns false if the send would block (client too slow) or the client is
|
||||||
|
/// gone — the caller stops generating so the serial coordinator thread is free
|
||||||
|
/// to admit the next request instead of blocking on one slow consumer.
|
||||||
|
fn emit_text(tokenizer: &Tokenizer, req: &GenerateRequest, token_id: u32, buf: &mut Vec<u8>) -> bool {
|
||||||
if tokenizer.is_eos(token_id) {
|
if tokenizer.is_eos(token_id) {
|
||||||
return;
|
return true;
|
||||||
}
|
}
|
||||||
let text = tokenizer.decode_token_stream(token_id, buf);
|
let text = tokenizer.decode_token_stream(token_id, buf);
|
||||||
if !text.is_empty() {
|
if !text.is_empty() {
|
||||||
let _ = req
|
return req
|
||||||
.sender
|
.sender
|
||||||
.blocking_send(GenerateEvent::Token { id: token_id, text });
|
.try_send(GenerateEvent::Token { id: token_id, text })
|
||||||
|
.is_ok();
|
||||||
}
|
}
|
||||||
|
true
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user