`from_weights_tp` shards each rank's weights (q/k/v/gate/up are column-split, o/down are row-split, and norms/embed/lm_head are replicated), and the paged forward pass uses local head counts plus an AllReduce after o_proj and another after down_proj. `PagedKVCache::new_tp` sizes the pool for the rank's local KV heads (the KV cache is sharded too). TP=1 is the identity path. A new `bench-tp` binary runs end-to-end multi-GPU generation for each TP degree. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
18 lines
481 B
TOML
18 lines
481 B
TOML
# Package metadata for the `xserv-model` crate.
[package]
|
|
name = "xserv-model"
|
|
# `version` and `edition` are not set here; `.workspace = true` inherits them
# from the workspace root manifest.
version.workspace = true
|
|
edition.workspace = true
|
|
|
|
# Dependencies of `xserv-model`.
[dependencies]
|
|
# Crates referenced by relative path (one directory up from this manifest).
xserv-cuda = { path = "../xserv-cuda" }
|
|
xserv-tensor = { path = "../xserv-tensor" }
|
|
xserv-kernels = { path = "../xserv-kernels" }
|
|
xserv-tokenizer = { path = "../xserv-tokenizer" }
|
|
xserv-distributed = { path = "../xserv-distributed" }
|
|
# Dependencies whose specification is inherited from the workspace root
# manifest via `.workspace = true`; no version or features are chosen here.
half.workspace = true
|
|
smallvec.workspace = true
|
|
serde.workspace = true
|
|
serde_json.workspace = true
|
|
safetensors.workspace = true
|
|
rand.workspace = true
|