Compare commits
3 Commits
8d41123418
...
eaf574cd4e
| Author | SHA1 | Date | |
|---|---|---|---|
| eaf574cd4e | |||
| 663ca9c5b9 | |||
| 84696604e8 |
33
configs/glm5-nvfp4-8xb300.yaml
Normal file
33
configs/glm5-nvfp4-8xb300.yaml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# GLM-5-NVFP4 (nvidia/GLM-5-NVFP4) on 8 x B300 (Blackwell Ultra, 288GB each).
|
||||||
|
# Architecture auto-loaded from HuggingFace config.json.
|
||||||
|
#
|
||||||
|
# FP4 weights: ~744B params * 0.5 bytes = ~372 GB across 8 GPUs.
|
||||||
|
# Total HBM: 8 * 288 GB = 2304 GB. KV budget: ~1900 GB after weights.
|
||||||
|
|
||||||
|
model:
|
||||||
|
config_json: ../models/GLM-5-NVFP4/config.json
|
||||||
|
name: glm-5-nvfp4
|
||||||
|
compute_dtype: fp4 # FP4 weights → selects FP4 tensor core FLOPS
|
||||||
|
dtype_bytes: 1 # FP8 KV cache
|
||||||
|
block_size_tokens: 512
|
||||||
|
|
||||||
|
hardware:
|
||||||
|
type: 8xb300
|
||||||
|
hbm_bytes: 1900.0e9 # KV budget after FP4 weights (~372 GB)
|
||||||
|
dram_bytes: 1.5e12 # ~1.5 TB usable CPU DRAM per node
|
||||||
|
|
||||||
|
cluster:
|
||||||
|
num_instances: 8
|
||||||
|
meta_store:
|
||||||
|
ttl_seconds: 300.0
|
||||||
|
router:
|
||||||
|
mode: prefix_affinity
|
||||||
|
prefix_k: 8
|
||||||
|
load_alpha: 1.0
|
||||||
|
|
||||||
|
sim:
|
||||||
|
trace_path: bailian-traces/glm_coder_blksz_512_040915-040917.jsonl
|
||||||
|
max_requests: null
|
||||||
|
output_dir: runs/glm5_nvfp4_8xb300
|
||||||
|
sample_interval_s: 1.0
|
||||||
|
seed: 42
|
||||||
59
models/GLM-5-NVFP4/config.json
Normal file
59
models/GLM-5-NVFP4/config.json
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"GlmMoeDsaForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"eos_token_id": [
|
||||||
|
154820,
|
||||||
|
154827,
|
||||||
|
154829
|
||||||
|
],
|
||||||
|
"ep_size": 1,
|
||||||
|
"first_k_dense_replace": 3,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"head_dim": 64,
|
||||||
|
"hidden_size": 6144,
|
||||||
|
"index_head_dim": 128,
|
||||||
|
"index_n_heads": 32,
|
||||||
|
"index_topk": 2048,
|
||||||
|
"indexer_rope_interleave": true,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 12288,
|
||||||
|
"kv_lora_rank": 512,
|
||||||
|
"max_position_embeddings": 202752,
|
||||||
|
"moe_intermediate_size": 2048,
|
||||||
|
"moe_layer_freq": 1,
|
||||||
|
"model_type": "glm_moe_dsa",
|
||||||
|
"n_group": 1,
|
||||||
|
"n_routed_experts": 256,
|
||||||
|
"n_shared_experts": 1,
|
||||||
|
"norm_topk_prob": true,
|
||||||
|
"num_attention_heads": 64,
|
||||||
|
"num_experts_per_tok": 8,
|
||||||
|
"num_hidden_layers": 78,
|
||||||
|
"num_key_value_heads": 64,
|
||||||
|
"num_nextn_predict_layers": 1,
|
||||||
|
"pad_token_id": 154820,
|
||||||
|
"pretraining_tp": 1,
|
||||||
|
"q_lora_rank": 2048,
|
||||||
|
"qk_head_dim": 256,
|
||||||
|
"qk_nope_head_dim": 192,
|
||||||
|
"qk_rope_head_dim": 64,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_interleave": true,
|
||||||
|
"rope_parameters": {
|
||||||
|
"rope_theta": 1000000,
|
||||||
|
"rope_type": "default"
|
||||||
|
},
|
||||||
|
"routed_scaling_factor": 2.5,
|
||||||
|
"scoring_func": "sigmoid",
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"topk_group": 1,
|
||||||
|
"topk_method": "noaux_tc",
|
||||||
|
"transformers_version": "5.0.2.dev0",
|
||||||
|
"use_cache": true,
|
||||||
|
"v_head_dim": 256,
|
||||||
|
"vocab_size": 154880
|
||||||
|
}
|
||||||
@@ -4,6 +4,7 @@
|
|||||||
use crate::cluster::meta_store::MetaStore;
|
use crate::cluster::meta_store::MetaStore;
|
||||||
use crate::config::{Config, ModelConfig};
|
use crate::config::{Config, ModelConfig};
|
||||||
use crate::instance::instance::AdmittedRequest;
|
use crate::instance::instance::AdmittedRequest;
|
||||||
|
use crate::instance::kv_cache::L1Change;
|
||||||
use crate::instance::Instance;
|
use crate::instance::Instance;
|
||||||
use crate::router::{self, RouteDecision, Router};
|
use crate::router::{self, RouteDecision, Router};
|
||||||
use crate::trace::RequestRecord;
|
use crate::trace::RequestRecord;
|
||||||
@@ -53,7 +54,9 @@ impl Cluster {
|
|||||||
/// per-request stats for metrics. Does NOT schedule the BatchTick — the
|
/// per-request stats for metrics. Does NOT schedule the BatchTick — the
|
||||||
/// simulator driver does that based on the returned `ready_at`.
|
/// simulator driver does that based on the returned `ready_at`.
|
||||||
pub fn route_and_admit(&mut self, req: &RequestRecord, now: f64) -> AdmissionStats {
|
pub fn route_and_admit(&mut self, req: &RequestRecord, now: f64) -> AdmissionStats {
|
||||||
let decision = self.router.route(req, &self.instances, &self.meta_store, now);
|
let decision = self
|
||||||
|
.router
|
||||||
|
.route(req, &self.instances, &self.meta_store, now);
|
||||||
let inst_id = decision.chosen;
|
let inst_id = decision.chosen;
|
||||||
let probe_overhead_s = decision.probe_overhead_s;
|
let probe_overhead_s = decision.probe_overhead_s;
|
||||||
|
|
||||||
@@ -68,19 +71,18 @@ impl Cluster {
|
|||||||
|
|
||||||
// 2. L1 lookup on the remaining suffix.
|
// 2. L1 lookup on the remaining suffix.
|
||||||
let suffix_after_l0 = &req.hash_ids[l0_hits as usize..];
|
let suffix_after_l0 = &req.hash_ids[l0_hits as usize..];
|
||||||
let l1_hits = inst.cache.l1.longest_prefix(suffix_after_l0) as u32;
|
let l1_hits = inst.cache.l1.longest_prefix_peek(suffix_after_l0) as u32;
|
||||||
// L1->L0 transfer cost
|
// L1->L0 transfer cost
|
||||||
let l1_bytes = (l1_hits as u64) * self.kv_block_bytes;
|
let l1_bytes = (l1_hits as u64) * self.kv_block_bytes;
|
||||||
let mut t = effective_now;
|
let mut t = effective_now;
|
||||||
|
let mut l1_changes = Vec::new();
|
||||||
if l1_hits > 0 {
|
if l1_hits > 0 {
|
||||||
t = inst.links.pcie.reserve(t, l1_bytes);
|
t = inst.links.pcie.reserve(t, l1_bytes);
|
||||||
// Promote those blocks into L0
|
l1_changes = inst
|
||||||
let mut evicted = Vec::new();
|
.cache
|
||||||
inst.cache.l0.insert_blocks(
|
.promote_l1_blocks_to_l0(&suffix_after_l0[..l1_hits as usize]);
|
||||||
&suffix_after_l0[..l1_hits as usize],
|
|
||||||
&mut evicted,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
Self::apply_l1_changes(&mut self.meta_store, inst_id, now, &l1_changes);
|
||||||
|
|
||||||
// 3. Remote v6d lookup for the still-remaining suffix.
|
// 3. Remote v6d lookup for the still-remaining suffix.
|
||||||
let suffix_after_l1 = &suffix_after_l0[l1_hits as usize..];
|
let suffix_after_l1 = &suffix_after_l0[l1_hits as usize..];
|
||||||
@@ -98,20 +100,14 @@ impl Cluster {
|
|||||||
}
|
}
|
||||||
let remote_bytes = (remote_hit_blocks as u64) * self.kv_block_bytes;
|
let remote_bytes = (remote_hit_blocks as u64) * self.kv_block_bytes;
|
||||||
if remote_hit_blocks > 0 {
|
if remote_hit_blocks > 0 {
|
||||||
// RDMA from peer host -> local DRAM, then PCIe -> GPU
|
let pulled = &suffix_after_l1[..remote_hit_blocks as usize];
|
||||||
|
let l1_changes = {
|
||||||
let inst = &mut self.instances[inst_id as usize];
|
let inst = &mut self.instances[inst_id as usize];
|
||||||
t = inst.links.rdma.reserve(t, remote_bytes);
|
t = inst.links.rdma.reserve(t, remote_bytes);
|
||||||
t = inst.links.pcie.reserve(t, remote_bytes);
|
t = inst.links.pcie.reserve(t, remote_bytes);
|
||||||
// Insert into local L1 (occupies LRU space) AND into L0
|
inst.cache.fetch_remote_blocks_to_l0(pulled)
|
||||||
let pulled = &suffix_after_l1[..remote_hit_blocks as usize];
|
};
|
||||||
let mut evicted_l1 = Vec::new();
|
Self::apply_l1_changes(&mut self.meta_store, inst_id, now, &l1_changes);
|
||||||
inst.cache.l1.insert_blocks(pulled, &mut evicted_l1);
|
|
||||||
let mut evicted_l0 = Vec::new();
|
|
||||||
inst.cache.l0.insert_blocks(pulled, &mut evicted_l0);
|
|
||||||
// The local instance now also owns these blocks - update meta_store.
|
|
||||||
for &h in pulled {
|
|
||||||
self.meta_store.insert(h, inst_id, now);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 4. Miss = remaining tokens to prefill from scratch.
|
// 4. Miss = remaining tokens to prefill from scratch.
|
||||||
@@ -119,20 +115,14 @@ impl Cluster {
|
|||||||
let miss_tokens = miss_blocks * self.block_size_tokens;
|
let miss_tokens = miss_blocks * self.block_size_tokens;
|
||||||
|
|
||||||
// The newly-prefilled blocks (after the request runs) are inserted
|
// The newly-prefilled blocks (after the request runs) are inserted
|
||||||
// into L0 here, and into L1 / meta_store via async writeback. Doing
|
// into L0 here. Only later L0 evictions become remotely visible by
|
||||||
// this at admission time is OK because we're tracking presence, not
|
// landing in L1 and being published to the meta store.
|
||||||
// actually moving bytes — the writeback latency is hidden behind
|
|
||||||
// request execution and we don't model meta_store inconsistency
|
|
||||||
// window beyond the TTL itself.
|
|
||||||
let inst = &mut self.instances[inst_id as usize];
|
|
||||||
let new_input_blocks = &req.hash_ids[(l0_hits + l1_hits + remote_hit_blocks) as usize..];
|
let new_input_blocks = &req.hash_ids[(l0_hits + l1_hits + remote_hit_blocks) as usize..];
|
||||||
let mut evicted_l0 = Vec::new();
|
let l1_changes = {
|
||||||
inst.cache.l0.insert_blocks(new_input_blocks, &mut evicted_l0);
|
let inst = &mut self.instances[inst_id as usize];
|
||||||
let mut evicted_l1 = Vec::new();
|
inst.cache.insert_blocks_into_l0(new_input_blocks)
|
||||||
inst.cache.l1.insert_blocks(new_input_blocks, &mut evicted_l1);
|
};
|
||||||
for &h in new_input_blocks {
|
Self::apply_l1_changes(&mut self.meta_store, inst_id, now, &l1_changes);
|
||||||
self.meta_store.insert(h, inst_id, now);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 5. Reserve KV slots for this request's prefill residency.
|
// 5. Reserve KV slots for this request's prefill residency.
|
||||||
// PD disaggregation: decode runs elsewhere, so only the input
|
// PD disaggregation: decode runs elsewhere, so only the input
|
||||||
@@ -145,6 +135,7 @@ impl Cluster {
|
|||||||
prefill_tokens_remaining: miss_tokens,
|
prefill_tokens_remaining: miss_tokens,
|
||||||
reserved_blocks,
|
reserved_blocks,
|
||||||
};
|
};
|
||||||
|
let inst = &mut self.instances[inst_id as usize];
|
||||||
inst.admit(admitted);
|
inst.admit(admitted);
|
||||||
|
|
||||||
let pcie_bytes = l1_bytes + remote_bytes;
|
let pcie_bytes = l1_bytes + remote_bytes;
|
||||||
@@ -164,4 +155,18 @@ impl Cluster {
|
|||||||
decision,
|
decision,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn apply_l1_changes(
|
||||||
|
meta_store: &mut MetaStore,
|
||||||
|
inst_id: InstanceId,
|
||||||
|
now: f64,
|
||||||
|
changes: &[L1Change],
|
||||||
|
) {
|
||||||
|
for change in changes {
|
||||||
|
match *change {
|
||||||
|
L1Change::Added(hash) => meta_store.insert(hash, inst_id, now),
|
||||||
|
L1Change::Removed(hash) => meta_store.remove(hash, inst_id),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -116,6 +116,21 @@ impl MetaStore {
|
|||||||
scores
|
scores
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Remove `instance`'s entry for `block_hash` (e.g. after L1 eviction).
|
||||||
|
///
|
||||||
|
/// The meta-store must reflect **L1 (DRAM) presence only**, because remote
|
||||||
|
/// RDMA fetch can only reach CPU DRAM, never GPU HBM. Whenever the L1
|
||||||
|
/// tier evicts a block, the caller must invoke this so the meta-store
|
||||||
|
/// stops advertising the block as remotely available on this instance.
|
||||||
|
pub fn remove(&mut self, block_hash: u64, instance: InstanceId) {
|
||||||
|
if let Some(bucket) = self.map.get_mut(&block_hash) {
|
||||||
|
bucket.retain(|e| e.instance != instance);
|
||||||
|
if bucket.is_empty() {
|
||||||
|
self.map.remove(&block_hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Lookup which (alive) instances claim to hold a given block.
|
/// Lookup which (alive) instances claim to hold a given block.
|
||||||
pub fn instances_for(&self, hash: u64, now: f64) -> SmallVec<[InstanceId; 4]> {
|
pub fn instances_for(&self, hash: u64, now: f64) -> SmallVec<[InstanceId; 4]> {
|
||||||
let mut out = SmallVec::new();
|
let mut out = SmallVec::new();
|
||||||
@@ -149,6 +164,34 @@ mod tests {
|
|||||||
assert_eq!(s[2], 0);
|
assert_eq!(s[2], 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn remove_cleans_up() {
    let mut store = MetaStore::new(60.0);
    // Instance 0 advertises blocks 10 and 11; instance 1 advertises block 10 only.
    store.insert(10, 0, 0.0);
    store.insert(10, 1, 0.0);
    store.insert(11, 0, 0.0);

    let before = store.instances_for(10, 0.5);
    assert_eq!(before.len(), 2);

    // Withdrawing instance 0's claim on block 10 leaves instance 1 as sole owner.
    store.remove(10, 0);
    let after_first = store.instances_for(10, 0.5);
    assert_eq!(after_first.len(), 1);
    assert_eq!(after_first[0], 1);

    // Block 11 is untouched: instance 0 still owns it.
    let other_block = store.instances_for(11, 0.5);
    assert_eq!(other_block.len(), 1);
    assert_eq!(other_block[0], 0);

    // Removing the last owner leaves no owners for the block at all.
    store.remove(10, 1);
    let after_last = store.instances_for(10, 0.5);
    assert!(after_last.is_empty());
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn ttl_expiry() {
|
fn ttl_expiry() {
|
||||||
let mut m = MetaStore::new(1.0);
|
let mut m = MetaStore::new(1.0);
|
||||||
|
|||||||
@@ -57,6 +57,13 @@ pub struct ModelConfig {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub attention: Option<AttentionConfig>,
|
pub attention: Option<AttentionConfig>,
|
||||||
|
|
||||||
|
/// Compute / weight precision: `"bf16"` (default), `"fp8"`, or `"fp4"`.
|
||||||
|
/// Controls which hardware FLOPS tier to use (`gpu_fp4_flops`, etc.) and
|
||||||
|
/// the weight-bytes-per-parameter for the memory-bound roofline check.
|
||||||
|
/// Independent of `dtype_bytes`, which sizes the KV cache.
|
||||||
|
#[serde(default)]
|
||||||
|
pub compute_dtype: Option<String>,
|
||||||
|
|
||||||
// -- Legacy manual coefficients (used when hidden_size is absent) ---------
|
// -- Legacy manual coefficients (used when hidden_size is absent) ---------
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub flops_per_token_prefill: Option<f64>,
|
pub flops_per_token_prefill: Option<f64>,
|
||||||
@@ -79,6 +86,20 @@ impl ModelConfig {
|
|||||||
self.hidden_size.is_some()
|
self.hidden_size.is_some()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Bytes per parameter for weight storage, derived from `compute_dtype`.
|
||||||
|
///
|
||||||
|
/// - `"fp4"` → 0.5
|
||||||
|
/// - `"fp8"` → 1.0
|
||||||
|
/// - `"bf16"` / absent → `dtype_bytes` (backward-compatible)
|
||||||
|
pub fn weight_dtype_bytes(&self) -> f64 {
|
||||||
|
match self.compute_dtype.as_deref() {
|
||||||
|
Some("fp4") => 0.5,
|
||||||
|
Some("fp8") => 1.0,
|
||||||
|
Some("bf16") => 2.0,
|
||||||
|
_ => self.dtype_bytes as f64, // backward compat
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Bytes of KV cache per block.
|
/// Bytes of KV cache per block.
|
||||||
///
|
///
|
||||||
/// For standard / GQA: `2 * L * kv_heads * head_dim * dtype * block_tokens`
|
/// For standard / GQA: `2 * L * kv_heads * head_dim * dtype * block_tokens`
|
||||||
@@ -147,7 +168,14 @@ pub enum AttentionConfig {
|
|||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct HardwareConfig {
|
pub struct HardwareConfig {
|
||||||
|
/// Active GPU FLOPS (selected from bf16/fp8/fp4 based on compute_dtype).
|
||||||
pub gpu_flops: f64,
|
pub gpu_flops: f64,
|
||||||
|
/// FP8 tensor core FLOPS (0 if not populated by preset).
|
||||||
|
#[serde(default)]
|
||||||
|
pub gpu_fp8_flops: f64,
|
||||||
|
/// FP4 tensor core FLOPS (0 if not populated by preset).
|
||||||
|
#[serde(default)]
|
||||||
|
pub gpu_fp4_flops: f64,
|
||||||
pub gpu_mem_bw: f64,
|
pub gpu_mem_bw: f64,
|
||||||
pub hbm_bytes: f64,
|
pub hbm_bytes: f64,
|
||||||
pub dram_bytes: f64,
|
pub dram_bytes: f64,
|
||||||
@@ -368,6 +396,8 @@ struct RawModelConfig {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
bytes_per_token_prefill: Option<f64>,
|
bytes_per_token_prefill: Option<f64>,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
|
compute_dtype: Option<String>,
|
||||||
|
#[serde(default)]
|
||||||
flops_per_token_decode: Option<f64>,
|
flops_per_token_decode: Option<f64>,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
bytes_per_token_decode: Option<f64>,
|
bytes_per_token_decode: Option<f64>,
|
||||||
@@ -407,12 +437,25 @@ struct RawHardwareConfig {
|
|||||||
|
|
||||||
impl RawConfig {
|
impl RawConfig {
|
||||||
fn resolve(self, yaml_dir: &Path) -> Result<Config> {
|
fn resolve(self, yaml_dir: &Path) -> Result<Config> {
|
||||||
Ok(Config {
|
let model = self.model.resolve(yaml_dir)?;
|
||||||
model: self.model.resolve(yaml_dir)?,
|
let user_set_gpu_flops = self.hardware.gpu_flops.is_some();
|
||||||
hardware: self.hardware.resolve()?,
|
let mut hardware = self.hardware.resolve()?;
|
||||||
cluster: self.cluster,
|
|
||||||
sim: self.sim,
|
// Auto-select gpu_flops tier based on model's compute_dtype,
|
||||||
})
|
// but only if the user did NOT explicitly override gpu_flops in YAML.
|
||||||
|
if !user_set_gpu_flops {
|
||||||
|
match model.compute_dtype.as_deref() {
|
||||||
|
Some("fp4") if hardware.gpu_fp4_flops > 0.0 => {
|
||||||
|
hardware.gpu_flops = hardware.gpu_fp4_flops;
|
||||||
|
}
|
||||||
|
Some("fp8") if hardware.gpu_fp8_flops > 0.0 => {
|
||||||
|
hardware.gpu_flops = hardware.gpu_fp8_flops;
|
||||||
|
}
|
||||||
|
_ => {} // keep BF16
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Config { model, hardware, cluster: self.cluster, sim: self.sim })
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -446,6 +489,7 @@ impl RawModelConfig {
|
|||||||
if let Some(v) = self.flops_per_token_prefill { m.flops_per_token_prefill = Some(v); }
|
if let Some(v) = self.flops_per_token_prefill { m.flops_per_token_prefill = Some(v); }
|
||||||
if let Some(v) = self.attn_quadratic_coeff { m.attn_quadratic_coeff = Some(v); }
|
if let Some(v) = self.attn_quadratic_coeff { m.attn_quadratic_coeff = Some(v); }
|
||||||
if let Some(v) = self.bytes_per_token_prefill { m.bytes_per_token_prefill = Some(v); }
|
if let Some(v) = self.bytes_per_token_prefill { m.bytes_per_token_prefill = Some(v); }
|
||||||
|
if self.compute_dtype.is_some() { m.compute_dtype = self.compute_dtype; }
|
||||||
if let Some(v) = self.flops_per_token_decode { m.flops_per_token_decode = Some(v); }
|
if let Some(v) = self.flops_per_token_decode { m.flops_per_token_decode = Some(v); }
|
||||||
if let Some(v) = self.bytes_per_token_decode { m.bytes_per_token_decode = Some(v); }
|
if let Some(v) = self.bytes_per_token_decode { m.bytes_per_token_decode = Some(v); }
|
||||||
|
|
||||||
@@ -476,6 +520,8 @@ impl RawHardwareConfig {
|
|||||||
} else {
|
} else {
|
||||||
HardwareConfig {
|
HardwareConfig {
|
||||||
gpu_flops: 0.0,
|
gpu_flops: 0.0,
|
||||||
|
gpu_fp8_flops: 0.0,
|
||||||
|
gpu_fp4_flops: 0.0,
|
||||||
gpu_mem_bw: 0.0,
|
gpu_mem_bw: 0.0,
|
||||||
hbm_bytes: 0.0,
|
hbm_bytes: 0.0,
|
||||||
dram_bytes: 0.0,
|
dram_bytes: 0.0,
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ pub const AVAILABLE: &[&str] = &[
|
|||||||
"a100-80gb",
|
"a100-80gb",
|
||||||
"a100-40gb",
|
"a100-40gb",
|
||||||
"b200",
|
"b200",
|
||||||
|
"b300",
|
||||||
"2xh100",
|
"2xh100",
|
||||||
"4xh100",
|
"4xh100",
|
||||||
"8xh100",
|
"8xh100",
|
||||||
@@ -32,6 +33,9 @@ pub const AVAILABLE: &[&str] = &[
|
|||||||
"2xb200",
|
"2xb200",
|
||||||
"4xb200",
|
"4xb200",
|
||||||
"8xb200",
|
"8xb200",
|
||||||
|
"2xb300",
|
||||||
|
"4xb300",
|
||||||
|
"8xb300",
|
||||||
];
|
];
|
||||||
|
|
||||||
/// Resolve a hardware preset by name.
|
/// Resolve a hardware preset by name.
|
||||||
@@ -48,6 +52,7 @@ pub fn resolve(name: &str) -> Option<HardwareConfig> {
|
|||||||
"a10080gb" | "a100" => Some(make_config(count, &A100_80GB)),
|
"a10080gb" | "a100" => Some(make_config(count, &A100_80GB)),
|
||||||
"a10040gb" => Some(make_config(count, &A100_40GB)),
|
"a10040gb" => Some(make_config(count, &A100_40GB)),
|
||||||
"b200" => Some(make_config(count, &B200)),
|
"b200" => Some(make_config(count, &B200)),
|
||||||
|
"b300" => Some(make_config(count, &B300)),
|
||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -73,14 +78,18 @@ fn parse_count_gpu(s: &str) -> (u32, String) {
|
|||||||
// -- Per-GPU base specs (single die, BF16 dense) -----------------------------
|
// -- Per-GPU base specs (single die, BF16 dense) -----------------------------
|
||||||
|
|
||||||
struct GpuBase {
|
struct GpuBase {
|
||||||
flops: f64, // BF16 dense TFLOPS
|
flops: f64, // BF16 dense FLOPS
|
||||||
|
fp8_flops: f64, // FP8 dense FLOPS (0 = not supported)
|
||||||
|
fp4_flops: f64, // FP4 dense FLOPS (0 = not supported)
|
||||||
mem_bw: f64, // HBM bandwidth (B/s)
|
mem_bw: f64, // HBM bandwidth (B/s)
|
||||||
hbm: f64, // Total HBM (bytes)
|
hbm: f64, // Total HBM (bytes)
|
||||||
pcie_gen: u32, // PCIe generation (4/5/6)
|
pcie_gen: u32, // PCIe generation (4/5/6)
|
||||||
}
|
}
|
||||||
|
|
||||||
const H100: GpuBase = GpuBase {
|
const H100: GpuBase = GpuBase {
|
||||||
flops: 9.89e14, // 989 TFLOPS BF16
|
flops: 9.89e14, // 989 TFLOPS BF16 dense
|
||||||
|
fp8_flops: 1.979e15, // 1979 TFLOPS FP8 dense
|
||||||
|
fp4_flops: 0.0, // not supported
|
||||||
mem_bw: 3.35e12, // 3.35 TB/s HBM3
|
mem_bw: 3.35e12, // 3.35 TB/s HBM3
|
||||||
hbm: 80.0e9, // 80 GB
|
hbm: 80.0e9, // 80 GB
|
||||||
pcie_gen: 5,
|
pcie_gen: 5,
|
||||||
@@ -88,6 +97,8 @@ const H100: GpuBase = GpuBase {
|
|||||||
|
|
||||||
const H800: GpuBase = GpuBase {
|
const H800: GpuBase = GpuBase {
|
||||||
flops: 9.89e14, // same die as H100
|
flops: 9.89e14, // same die as H100
|
||||||
|
fp8_flops: 1.979e15,
|
||||||
|
fp4_flops: 0.0,
|
||||||
mem_bw: 3.35e12, // 3.35 TB/s HBM3
|
mem_bw: 3.35e12, // 3.35 TB/s HBM3
|
||||||
hbm: 80.0e9, // 80 GB
|
hbm: 80.0e9, // 80 GB
|
||||||
pcie_gen: 5,
|
pcie_gen: 5,
|
||||||
@@ -95,6 +106,8 @@ const H800: GpuBase = GpuBase {
|
|||||||
|
|
||||||
const H20: GpuBase = GpuBase {
|
const H20: GpuBase = GpuBase {
|
||||||
flops: 1.48e14, // 148 TFLOPS BF16 (China-export Hopper)
|
flops: 1.48e14, // 148 TFLOPS BF16 (China-export Hopper)
|
||||||
|
fp8_flops: 2.96e14, // 296 TFLOPS FP8
|
||||||
|
fp4_flops: 0.0, // not supported
|
||||||
mem_bw: 4.0e12, // 4.0 TB/s HBM3
|
mem_bw: 4.0e12, // 4.0 TB/s HBM3
|
||||||
hbm: 96.0e9, // 96 GB
|
hbm: 96.0e9, // 96 GB
|
||||||
pcie_gen: 5,
|
pcie_gen: 5,
|
||||||
@@ -102,6 +115,8 @@ const H20: GpuBase = GpuBase {
|
|||||||
|
|
||||||
const A100_80GB: GpuBase = GpuBase {
|
const A100_80GB: GpuBase = GpuBase {
|
||||||
flops: 3.12e14, // 312 TFLOPS BF16
|
flops: 3.12e14, // 312 TFLOPS BF16
|
||||||
|
fp8_flops: 0.0, // A100 has no FP8 tensor cores
|
||||||
|
fp4_flops: 0.0,
|
||||||
mem_bw: 2.0e12, // 2.0 TB/s HBM2e
|
mem_bw: 2.0e12, // 2.0 TB/s HBM2e
|
||||||
hbm: 80.0e9, // 80 GB
|
hbm: 80.0e9, // 80 GB
|
||||||
pcie_gen: 4,
|
pcie_gen: 4,
|
||||||
@@ -109,18 +124,33 @@ const A100_80GB: GpuBase = GpuBase {
|
|||||||
|
|
||||||
const A100_40GB: GpuBase = GpuBase {
|
const A100_40GB: GpuBase = GpuBase {
|
||||||
flops: 3.12e14, // 312 TFLOPS BF16
|
flops: 3.12e14, // 312 TFLOPS BF16
|
||||||
|
fp8_flops: 0.0,
|
||||||
|
fp4_flops: 0.0,
|
||||||
mem_bw: 1.555e12, // 1.555 TB/s HBM2e
|
mem_bw: 1.555e12, // 1.555 TB/s HBM2e
|
||||||
hbm: 40.0e9, // 40 GB
|
hbm: 40.0e9, // 40 GB
|
||||||
pcie_gen: 4,
|
pcie_gen: 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// DGX B200 (8 GPU) specs: BF16 18 PFLOPS, FP8 36 PFLOPS, FP4 72 PFLOPS (dense)
|
||||||
const B200: GpuBase = GpuBase {
|
const B200: GpuBase = GpuBase {
|
||||||
flops: 2.25e15, // 2250 TFLOPS BF16
|
flops: 2.25e15, // 2250 TFLOPS BF16 dense
|
||||||
|
fp8_flops: 4.5e15, // 4500 TFLOPS FP8 dense
|
||||||
|
fp4_flops: 9.0e15, // 9000 TFLOPS FP4 dense
|
||||||
mem_bw: 8.0e12, // 8.0 TB/s HBM3e
|
mem_bw: 8.0e12, // 8.0 TB/s HBM3e
|
||||||
hbm: 192.0e9, // 192 GB
|
hbm: 192.0e9, // 192 GB
|
||||||
pcie_gen: 6,
|
pcie_gen: 6,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// DGX B300 (8 GPU) specs: BF16 18 PFLOPS, FP8 ~54 PFLOPS, FP4 108 PFLOPS (dense)
|
||||||
|
const B300: GpuBase = GpuBase {
|
||||||
|
flops: 2.25e15, // 2250 TFLOPS BF16 dense (same GB202 die as B200)
|
||||||
|
fp8_flops: 6.75e15, // 6750 TFLOPS FP8 dense (estimated from FP4/2)
|
||||||
|
fp4_flops: 13.5e15, // 13500 TFLOPS FP4 dense (Blackwell Ultra enhanced)
|
||||||
|
mem_bw: 12.0e12, // 12 TB/s HBM3e 12-Hi
|
||||||
|
hbm: 288.0e9, // 288 GB HBM3e 12-Hi
|
||||||
|
pcie_gen: 6,
|
||||||
|
};
|
||||||
|
|
||||||
/// Build a [`HardwareConfig`] from a base GPU spec × TP count.
|
/// Build a [`HardwareConfig`] from a base GPU spec × TP count.
|
||||||
///
|
///
|
||||||
/// Compute, HBM bandwidth, and HBM capacity scale linearly with `n`.
|
/// Compute, HBM bandwidth, and HBM capacity scale linearly with `n`.
|
||||||
@@ -153,6 +183,8 @@ fn make_config(n: u32, base: &GpuBase) -> HardwareConfig {
|
|||||||
|
|
||||||
HardwareConfig {
|
HardwareConfig {
|
||||||
gpu_flops: base.flops * f,
|
gpu_flops: base.flops * f,
|
||||||
|
gpu_fp8_flops: base.fp8_flops * f,
|
||||||
|
gpu_fp4_flops: base.fp4_flops * f,
|
||||||
gpu_mem_bw: base.mem_bw * f,
|
gpu_mem_bw: base.mem_bw * f,
|
||||||
hbm_bytes: base.hbm * f,
|
hbm_bytes: base.hbm * f,
|
||||||
dram_bytes: dram,
|
dram_bytes: dram,
|
||||||
|
|||||||
@@ -75,7 +75,8 @@ impl ComputeModel {
|
|||||||
let n_kv = model.num_kv_heads as f64;
|
let n_kv = model.num_kv_heads as f64;
|
||||||
let hd = model.head_dim as f64;
|
let hd = model.head_dim as f64;
|
||||||
let inter = model.intermediate_size.unwrap_or(0) as f64;
|
let inter = model.intermediate_size.unwrap_or(0) as f64;
|
||||||
let dtype = model.dtype_bytes as f64;
|
// Weight dtype for memory-bound check (separate from KV cache dtype).
|
||||||
|
let wdtype = model.weight_dtype_bytes();
|
||||||
|
|
||||||
// --- Attention linear FLOPs/token/layer ---
|
// --- Attention linear FLOPs/token/layer ---
|
||||||
let attn_linear = if let Some(mla) = &model.mla {
|
let attn_linear = if let Some(mla) = &model.mla {
|
||||||
@@ -134,18 +135,18 @@ impl ComputeModel {
|
|||||||
(h * qlr + qlr * n_heads * qk_hd
|
(h * qlr + qlr * n_heads * qk_hd
|
||||||
+ h * (kvlr + qk_rd)
|
+ h * (kvlr + qk_rd)
|
||||||
+ n_heads * vhd * h)
|
+ n_heads * vhd * h)
|
||||||
* dtype
|
* wdtype
|
||||||
} else {
|
} else {
|
||||||
((n_heads + 2.0 * n_kv) * hd * h + n_heads * hd * h) * dtype
|
((n_heads + 2.0 * n_kv) * hd * h + n_heads * hd * h) * wdtype
|
||||||
};
|
};
|
||||||
let mlp_wt = if let Some(moe) = &model.moe {
|
let mlp_wt = if let Some(moe) = &model.moe {
|
||||||
let expert_inter = moe.expert_intermediate_size
|
let expert_inter = moe.expert_intermediate_size
|
||||||
.unwrap_or(model.intermediate_size.unwrap_or(0)) as f64;
|
.unwrap_or(model.intermediate_size.unwrap_or(0)) as f64;
|
||||||
let active = moe.num_active_experts as f64;
|
let active = moe.num_active_experts as f64;
|
||||||
let shared = moe.num_shared_experts as f64;
|
let shared = moe.num_shared_experts as f64;
|
||||||
(active * 3.0 * h * expert_inter + shared * 3.0 * h * inter) * dtype
|
(active * 3.0 * h * expert_inter + shared * 3.0 * h * inter) * wdtype
|
||||||
} else {
|
} else {
|
||||||
3.0 * h * inter * dtype
|
3.0 * h * inter * wdtype
|
||||||
};
|
};
|
||||||
let weight_bytes = attn_wt + mlp_wt;
|
let weight_bytes = attn_wt + mlp_wt;
|
||||||
|
|
||||||
@@ -385,6 +386,8 @@ mod tests {
|
|||||||
};
|
};
|
||||||
let hw = HardwareConfig {
|
let hw = HardwareConfig {
|
||||||
gpu_flops: 1e14,
|
gpu_flops: 1e14,
|
||||||
|
gpu_fp8_flops: 0.0,
|
||||||
|
gpu_fp4_flops: 0.0,
|
||||||
gpu_mem_bw: 1e12,
|
gpu_mem_bw: 1e12,
|
||||||
hbm_bytes: 1e9,
|
hbm_bytes: 1e9,
|
||||||
dram_bytes: 4e9,
|
dram_bytes: 4e9,
|
||||||
|
|||||||
@@ -10,6 +10,12 @@
|
|||||||
|
|
||||||
use ahash::AHashMap;
|
use ahash::AHashMap;
|
||||||
|
|
||||||
|
/// A single change to the set of block hashes resident in the L1 tier.
///
/// The cache methods that return `Vec<L1Change>` report these in the order
/// they occurred.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum L1Change {
    /// The block with this hash entered L1.
    Added(u64),
    /// The block with this hash left L1.
    Removed(u64),
}
|
||||||
|
|
||||||
/// Doubly-linked-list-backed LRU keyed by block hash.
|
/// Doubly-linked-list-backed LRU keyed by block hash.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct LruBlocks {
|
pub struct LruBlocks {
|
||||||
@@ -56,6 +62,16 @@ impl LruBlocks {
|
|||||||
self.map.contains_key(&key)
|
self.map.contains_key(&key)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn remove(&mut self, key: u64) -> bool {
|
||||||
|
if let Some(idx) = self.map.remove(&key) {
|
||||||
|
self.detach(idx);
|
||||||
|
self.free.push(idx);
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Touch (move to MRU) if present. Returns whether the key was present.
|
/// Touch (move to MRU) if present. Returns whether the key was present.
|
||||||
pub fn touch(&mut self, key: u64) -> bool {
|
pub fn touch(&mut self, key: u64) -> bool {
|
||||||
if let Some(&idx) = self.map.get(&key) {
|
if let Some(&idx) = self.map.get(&key) {
|
||||||
@@ -70,31 +86,45 @@ impl LruBlocks {
|
|||||||
/// existing block just touches it.
|
/// existing block just touches it.
|
||||||
pub fn insert_blocks(&mut self, hashes: &[u64], evicted_out: &mut Vec<u64>) {
|
pub fn insert_blocks(&mut self, hashes: &[u64], evicted_out: &mut Vec<u64>) {
|
||||||
for &h in hashes {
|
for &h in hashes {
|
||||||
if self.touch(h) {
|
if let Some(evicted) = self.insert_block(h) {
|
||||||
continue;
|
evicted_out.push(evicted);
|
||||||
}
|
}
|
||||||
// need to make room?
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert_block(&mut self, key: u64) -> Option<u64> {
|
||||||
|
if self.touch(key) {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let mut evicted = None;
|
||||||
if self.map.len() == self.capacity {
|
if self.map.len() == self.capacity {
|
||||||
if let Some(tail_idx) = self.tail {
|
if let Some(tail_idx) = self.tail {
|
||||||
let tail_key = self.nodes[tail_idx].key;
|
let tail_key = self.nodes[tail_idx].key;
|
||||||
self.detach(tail_idx);
|
self.detach(tail_idx);
|
||||||
self.map.remove(&tail_key);
|
self.map.remove(&tail_key);
|
||||||
self.free.push(tail_idx);
|
self.free.push(tail_idx);
|
||||||
evicted_out.push(tail_key);
|
evicted = Some(tail_key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// allocate node
|
|
||||||
let idx = if let Some(i) = self.free.pop() {
|
let idx = if let Some(i) = self.free.pop() {
|
||||||
self.nodes[i] = Node { key: h, prev: None, next: None };
|
self.nodes[i] = Node {
|
||||||
|
key,
|
||||||
|
prev: None,
|
||||||
|
next: None,
|
||||||
|
};
|
||||||
i
|
i
|
||||||
} else {
|
} else {
|
||||||
let i = self.nodes.len();
|
let i = self.nodes.len();
|
||||||
self.nodes.push(Node { key: h, prev: None, next: None });
|
self.nodes.push(Node {
|
||||||
|
key,
|
||||||
|
prev: None,
|
||||||
|
next: None,
|
||||||
|
});
|
||||||
i
|
i
|
||||||
};
|
};
|
||||||
self.map.insert(h, idx);
|
self.map.insert(key, idx);
|
||||||
self.attach_to_head(idx);
|
self.attach_to_head(idx);
|
||||||
}
|
evicted
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Longest leading prefix of `hashes` present; touches the matched blocks.
|
/// Longest leading prefix of `hashes` present; touches the matched blocks.
|
||||||
@@ -178,6 +208,68 @@ impl TwoTierCache {
|
|||||||
l1: LruBlocks::new(l1_cap),
|
l1: LruBlocks::new(l1_cap),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Insert every hash in `hashes` into L0, in order.
///
/// Returns the L1 membership changes produced along the way (blocks that
/// left L1, and L0 evictions that landed in L1).
pub fn insert_blocks_into_l0(&mut self, hashes: &[u64]) -> Vec<L1Change> {
    let mut l1_delta = Vec::new();
    hashes
        .iter()
        .copied()
        .for_each(|block| self.insert_block_into_l0(block, &mut l1_delta));
    l1_delta
}
|
||||||
|
|
||||||
|
pub fn promote_l1_blocks_to_l0(&mut self, hashes: &[u64]) -> Vec<L1Change> {
|
||||||
|
let mut changes = Vec::new();
|
||||||
|
for &h in hashes {
|
||||||
|
if self.l1.remove(h) {
|
||||||
|
changes.push(L1Change::Removed(h));
|
||||||
|
}
|
||||||
|
self.insert_block_into_l0(h, &mut changes);
|
||||||
|
}
|
||||||
|
changes
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Brings remotely fetched blocks into L0, staging each one through L1 first.
///
/// For a hash that is not resident in either tier, the block is first
/// inserted into L1 via `stage_remote_block_in_l1` (which may log the removal
/// of an L1 victim), then taken back out of L1 and inserted into L0 via
/// `insert_block_into_l0`. The staged block itself produces no `Added` /
/// `Removed` pair, because it does not remain in L1.
///
/// A hash that is already resident locally (in L0 or L1, including a hash
/// repeated within `hashes`) skips staging and goes straight to
/// `insert_block_into_l0`. This keeps the change log accurate: a block that
/// was sitting in L1 is reported as `L1Change::Removed` when it moves to L0,
/// and an L0-resident block no longer trips the staging `debug_assert!`.
///
/// Returns all recorded L1 changes in the order they happened.
pub fn fetch_remote_blocks_to_l0(&mut self, hashes: &[u64]) -> Vec<L1Change> {
    let mut changes = Vec::new();
    for &h in hashes {
        if self.l0.contains(h) || self.l1.contains(h) {
            // Nothing to stage: staging would be a no-op here, and silently
            // removing an existing L1 copy would leave the change log without
            // a matching `Removed` entry.
            self.insert_block_into_l0(h, &mut changes);
            continue;
        }

        self.stage_remote_block_in_l1(h, &mut changes);
        // The staged copy only passes through L1, so its removal is
        // deliberately not logged.
        let removed = self.l1.remove(h);
        debug_assert!(removed, "staged remote block must be present in l1");
        self.insert_block_into_l0(h, &mut changes);
    }
    changes
}
|
||||||
|
|
||||||
|
fn insert_block_into_l0(&mut self, hash: u64, changes: &mut Vec<L1Change>) {
|
||||||
|
if self.l0.touch(hash) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if self.l1.remove(hash) {
|
||||||
|
changes.push(L1Change::Removed(hash));
|
||||||
|
}
|
||||||
|
if let Some(evicted_l0) = self.l0.insert_block(hash) {
|
||||||
|
self.demote_into_l1(evicted_l0, changes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn stage_remote_block_in_l1(&mut self, hash: u64, changes: &mut Vec<L1Change>) {
|
||||||
|
if self.l0.contains(hash) || self.l1.contains(hash) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if let Some(evicted_l1) = self.l1.insert_block(hash) {
|
||||||
|
changes.push(L1Change::Removed(evicted_l1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn demote_into_l1(&mut self, hash: u64, changes: &mut Vec<L1Change>) {
|
||||||
|
debug_assert!(!self.l0.contains(hash));
|
||||||
|
if self.l1.touch(hash) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if let Some(evicted_l1) = self.l1.insert_block(hash) {
|
||||||
|
changes.push(L1Change::Removed(evicted_l1));
|
||||||
|
}
|
||||||
|
changes.push(L1Change::Added(hash));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -223,4 +315,61 @@ mod tests {
|
|||||||
c.insert_blocks(&[4], &mut ev);
|
c.insert_blocks(&[4], &mut ev);
|
||||||
assert_eq!(ev, vec![2]);
|
assert_eq!(ev, vec![2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn two_tier_cache_demotes_l0_evictions_into_l1() {
    let mut cache = TwoTierCache::new(2, 2);

    // The first two inserts must not report any L1 change.
    let fill_changes = cache.insert_blocks_into_l0(&[1, 2]);
    assert!(fill_changes.is_empty());

    // A third insert pushes block 1 out of L0 and into L1.
    let overflow_changes = cache.insert_blocks_into_l0(&[3]);

    assert!(cache.l0.contains(2));
    assert!(cache.l0.contains(3));
    assert!(!cache.l0.contains(1));
    assert!(cache.l1.contains(1));
    assert_eq!(overflow_changes, vec![L1Change::Added(1)]);
}
|
||||||
|
|
||||||
|
#[test]
fn promoting_l1_blocks_to_l0_keeps_tiers_exclusive() {
    let mut cache = TwoTierCache::new(2, 2);
    // Setup only; the returned change list is not inspected.
    cache.insert_blocks_into_l0(&[1, 2, 3]);

    // Promoting block 1 must pull it out of L1 and demote block 2 in its place.
    let promote_changes = cache.promote_l1_blocks_to_l0(&[1]);

    assert!(cache.l0.contains(1));
    assert!(cache.l0.contains(3));
    assert!(!cache.l0.contains(2));
    assert!(!cache.l1.contains(1));
    assert!(cache.l1.contains(2));

    let expected = vec![L1Change::Removed(1), L1Change::Added(2)];
    assert_eq!(promote_changes, expected);
}
|
||||||
|
|
||||||
|
#[test]
fn reinserting_block_into_l0_removes_duplicate_from_l1() {
    let mut cache = TwoTierCache::new(2, 2);
    // Setup only; the returned change list is not inspected.
    cache.insert_blocks_into_l0(&[1, 2, 3]);

    // Re-inserting block 1 must not leave a second copy of it in L1.
    let reinsert_changes = cache.insert_blocks_into_l0(&[1]);

    assert!(cache.l0.contains(1));
    assert!(cache.l0.contains(3));
    assert!(!cache.l1.contains(1));
    assert!(cache.l1.contains(2));

    let expected = vec![L1Change::Removed(1), L1Change::Added(2)];
    assert_eq!(reinsert_changes, expected);
}
|
||||||
|
|
||||||
|
#[test]
fn remote_fetch_uses_l1_capacity_before_promoting_to_l0() {
    let mut cache = TwoTierCache::new(2, 1);
    // Setup only; the returned change list is not inspected.
    cache.insert_blocks_into_l0(&[1, 2, 3]);

    // Fetching block 4 must report block 1 leaving L1 and block 2 entering it.
    let fetch_changes = cache.fetch_remote_blocks_to_l0(&[4]);

    assert!(cache.l0.contains(3));
    assert!(cache.l0.contains(4));
    assert!(!cache.l1.contains(1));
    assert!(cache.l1.contains(2));

    let expected = vec![L1Change::Removed(1), L1Change::Added(2)];
    assert_eq!(fetch_changes, expected);
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,6 +22,8 @@ fn base_config(trace_path: &str, out_dir: &str, mode: RouterMode) -> Config {
|
|||||||
},
|
},
|
||||||
hardware: HardwareConfig {
|
hardware: HardwareConfig {
|
||||||
gpu_flops: 1.0e14,
|
gpu_flops: 1.0e14,
|
||||||
|
gpu_fp8_flops: 0.0,
|
||||||
|
gpu_fp4_flops: 0.0,
|
||||||
gpu_mem_bw: 1.0e12,
|
gpu_mem_bw: 1.0e12,
|
||||||
hbm_bytes: 1.0e9,
|
hbm_bytes: 1.0e9,
|
||||||
dram_bytes: 4.0e9,
|
dram_bytes: 4.0e9,
|
||||||
|
|||||||
Reference in New Issue
Block a user