Load the model's chat_template.jinja (or tokenizer_config.json chat_template field) at startup and render it with minijinja instead of hardcoded per-model prompt builders. Custom Jinja functions: strftime_now (date formatting), raise_exception (template validation errors). Falls back to Qwen3 ChatML template if no Jinja template is found. Removes the hardcoded build_prompt_gpt_oss() — the model's own template now drives prompt formatting, matching llama.cpp's behavior exactly. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
32 lines
664 B
TOML
32 lines
664 B
TOML
# Cargo workspace root: the crates listed in `members` are built together.
[workspace]
# Use version 2 of Cargo's feature resolver for every member.
resolver = "2"
members = [
    "crates/xserv-cuda",
    "crates/xserv-tensor",
    "crates/xserv-kernels",
    "crates/xserv-model",
    "crates/xserv-tokenizer",
    "crates/xserv-server",
    "crates/xserv-distributed",
]

# Package metadata declared once for the workspace; a member crate inherits
# a key by writing e.g. `version.workspace = true` in its own manifest.
[workspace.package]
version = "0.1.0"
edition = "2024"
license = "MIT"

# Dependency versions declared once for the workspace; a member crate opts in
# with `<name>.workspace = true` (or `{ workspace = true, ... }`).
[workspace.dependencies]
half = "2"
smallvec = "1"
libc = "0.2"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
safetensors = "0.5"
regex = "1"
tokio = { version = "1", features = ["full"] }
axum = "0.8"
uuid = { version = "1", features = ["v4"] }
tokio-stream = "0.1"
rand = "0.8"
# Renders the model's own chat template (chat_template.jinja or the
# tokenizer_config.json `chat_template` field) instead of hardcoded
# per-model prompt builders.
minijinja = { version = "2", features = ["builtins"] }