fix(xserv-chat): UTF-8/CJK-aware line input
Cooked-mode read_line() left line editing to the terminal, so Backspace on a multi-byte 汉字/かな/한글 deleted a byte (or behaved inconsistently across TTYs). Replace with a raw-mode reader (libc termios): Backspace pops a whole char, multi-byte input is reassembled from its continuation bytes, and a full-line redraw renders double-width glyphs correctly. Non-TTY input falls back to a plain read; raw mode is restored after each line. libc is already a locked transitive dep, so this builds offline. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -18,6 +18,7 @@ license = "MIT"
|
||||
[workspace.dependencies]
|
||||
half = "2"
|
||||
smallvec = "1"
|
||||
libc = "0.2"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
safetensors = "0.5"
|
||||
|
||||
@@ -10,6 +10,7 @@ xserv-kernels = { path = "../xserv-kernels" }
|
||||
xserv-tokenizer = { path = "../xserv-tokenizer" }
|
||||
xserv-distributed = { path = "../xserv-distributed" }
|
||||
half.workspace = true
|
||||
libc.workspace = true
|
||||
smallvec.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::io::{self, IsTerminal, Write};
|
||||
use std::io::{self, IsTerminal, Read, Write};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use xserv_model::{loader, sample, ModelConfig, PagedKVCache, Qwen3, SamplingParams, BLOCK_SIZE};
|
||||
@@ -22,6 +22,139 @@ enum Finish {
|
||||
Length,
|
||||
}
|
||||
|
||||
/// Outcome of one attempt to read a line of user input.
enum Line {
    /// A line was entered; the payload is the text that was read.
    Text(String),
    /// The input ended before any text was entered.
    Eof,
}
|
||||
|
||||
/// RAII terminal raw-mode guard. Disables canonical mode + echo (keeps output
|
||||
/// post-processing and signals), so we read keystrokes ourselves and edit the
|
||||
/// line UTF-8-aware. Restores the original termios on drop.
|
||||
struct RawMode {
|
||||
orig: libc::termios,
|
||||
}
|
||||
|
||||
impl RawMode {
|
||||
fn enable() -> Option<Self> {
|
||||
unsafe {
|
||||
let mut orig: libc::termios = std::mem::zeroed();
|
||||
if libc::tcgetattr(libc::STDIN_FILENO, &mut orig) != 0 {
|
||||
return None;
|
||||
}
|
||||
let mut raw = orig;
|
||||
raw.c_lflag &= !(libc::ICANON | libc::ECHO);
|
||||
raw.c_cc[libc::VMIN as usize] = 1;
|
||||
raw.c_cc[libc::VTIME as usize] = 0;
|
||||
if libc::tcsetattr(libc::STDIN_FILENO, libc::TCSANOW, &raw) != 0 {
|
||||
return None;
|
||||
}
|
||||
Some(RawMode { orig })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RawMode {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
libc::tcsetattr(libc::STDIN_FILENO, libc::TCSANOW, &self.orig);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Read one line with UTF-8/CJK-aware editing. In a TTY this enters raw mode and
|
||||
/// handles keystrokes so Backspace deletes a whole character (not a byte), and
|
||||
/// multi-byte input (汉字/日本語/한글) renders correctly. Non-TTY (piped) input
|
||||
/// falls back to a plain cooked read.
|
||||
fn read_line_edited(prompt: &str) -> Line {
|
||||
let cooked = || -> Line {
|
||||
print!("{prompt}");
|
||||
io::stdout().flush().ok();
|
||||
let mut s = String::new();
|
||||
match io::stdin().read_line(&mut s) {
|
||||
Ok(0) | Err(_) => Line::Eof,
|
||||
Ok(_) => Line::Text(s),
|
||||
}
|
||||
};
|
||||
|
||||
if !io::stdin().is_terminal() {
|
||||
return cooked();
|
||||
}
|
||||
let Some(_raw) = RawMode::enable() else {
|
||||
return cooked();
|
||||
};
|
||||
|
||||
// Single-line editor: on every edit, rewrite the whole line so the terminal
|
||||
// renders correct (incl. double-width CJK) glyphs; \x1b[K clears leftovers.
|
||||
let redraw = |buf: &str| {
|
||||
print!("\r{prompt}{buf}\x1b[K");
|
||||
io::stdout().flush().ok();
|
||||
};
|
||||
|
||||
let mut buf = String::new();
|
||||
redraw(&buf);
|
||||
let mut stdin = io::stdin().lock();
|
||||
let mut byte = [0u8; 1];
|
||||
|
||||
loop {
|
||||
if stdin.read(&mut byte).unwrap_or(0) == 0 {
|
||||
// EOF on the stream.
|
||||
if buf.is_empty() {
|
||||
return Line::Eof;
|
||||
}
|
||||
break;
|
||||
}
|
||||
match byte[0] {
|
||||
b'\r' | b'\n' => {
|
||||
println!();
|
||||
break;
|
||||
}
|
||||
0x7f | 0x08 => {
|
||||
// Backspace: drop one whole char (String::pop is char-aware).
|
||||
buf.pop();
|
||||
redraw(&buf);
|
||||
}
|
||||
0x04 => {
|
||||
// Ctrl-D: EOF only when the line is empty.
|
||||
if buf.is_empty() {
|
||||
return Line::Eof;
|
||||
}
|
||||
}
|
||||
0x1b => {
|
||||
// Escape sequence (arrows, etc.): consume and ignore the 2 bytes
|
||||
// of a typical CSI sequence so they don't land in the buffer.
|
||||
let mut seq = [0u8; 2];
|
||||
let _ = stdin.read(&mut seq);
|
||||
}
|
||||
b if b < 0x20 => { /* other control bytes: ignore */ }
|
||||
b if b < 0x80 => {
|
||||
buf.push(b as char);
|
||||
redraw(&buf);
|
||||
}
|
||||
b => {
|
||||
// UTF-8 multi-byte: read the continuation bytes for this char.
|
||||
let extra = if b >= 0xF0 { 3 } else if b >= 0xE0 { 2 } else { 1 };
|
||||
let mut bytes = vec![b];
|
||||
let mut cont = [0u8; 1];
|
||||
let mut ok = true;
|
||||
for _ in 0..extra {
|
||||
if stdin.read(&mut cont).unwrap_or(0) == 0 {
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
bytes.push(cont[0]);
|
||||
}
|
||||
if ok {
|
||||
if let Ok(s) = std::str::from_utf8(&bytes) {
|
||||
buf.push_str(s);
|
||||
redraw(&buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Line::Text(buf)
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let opts = parse_args();
|
||||
|
||||
@@ -65,14 +198,11 @@ fn main() {
|
||||
eprintln!("Commands: /exit, /quit, /clear\n");
|
||||
|
||||
loop {
|
||||
print!("user> ");
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let mut input = String::new();
|
||||
if io::stdin().read_line(&mut input).unwrap() == 0 {
|
||||
break;
|
||||
}
|
||||
let input = input.trim();
|
||||
let line = match read_line_edited("user> ") {
|
||||
Line::Eof => break,
|
||||
Line::Text(s) => s,
|
||||
};
|
||||
let input = line.trim();
|
||||
if input.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user