diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json
index d88e29e..ba654e5 100644
--- a/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json
+++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json
@@ -26,7 +26,9 @@
     "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
   ],
   "base_envs": {
-    "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
+    "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
+    "HOME": "/tmp/wjh",
+    "XDG_CACHE_HOME": "/tmp/wjh/.cache"
   },
   "base_flags": {
     "host": "127.0.0.1",
diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json
index 9ac41a6..21051f9 100644
--- a/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json
+++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json
@@ -26,7 +26,9 @@
     "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
   ],
   "base_envs": {
-    "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
+    "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
+    "HOME": "/tmp/wjh",
+    "XDG_CACHE_HOME": "/tmp/wjh/.cache"
   },
   "base_flags": {
     "host": "127.0.0.1",
diff --git a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md
index 2cc3053..3834249 100644
--- a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md
+++ b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md
@@ -15,6 +15,8 @@ The comparison is:
 
 Both specs start from the same base vLLM configuration. The base contains only serving access fields: `host`, `port`, and `served-model-name`. It does not set performance flags such as TP, DP, EP, max model length, prefix cache, chunked prefill, max-num-seqs, max-num-batched-tokens, or gpu-memory-utilization. The first trial therefore measures community vLLM defaults for this model.
 
+The launch environment sets `HOME=/tmp/wjh` and `XDG_CACHE_HOME=/tmp/wjh/.cache` so vLLM, torch.compile, and FlashInfer build caches land on dash0 local storage instead of CPFS. This is a startup/cache placement choice, not a vLLM performance flag.
+
 ## vLLM Install
 
 PyPI reports `vllm==0.20.0` as the current community release checked on 2026-05-02. The dash0 runtime venv is on local rootfs rather than CPFS, because installing torch/CUDA wheels into CPFS was I/O-bound:
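
For illustration, the sketch below shows how the new `base_envs` entries could take effect at server launch: the config values are merged over the inherited environment so `HOME`/`XDG_CACHE_HOME`-relative caches resolve to local `/tmp/wjh` storage instead of CPFS. This is a minimal, hypothetical sketch, not the actual runner code; the `base_flags` values other than `host` (port, served-model-name) are placeholders, and the exact merge/launch behavior of the harness may differ.

```python
import os
import subprocess

# Values taken from the config above; port and served-model-name are placeholders.
model = "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
base_envs = {
    "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
    "HOME": "/tmp/wjh",                   # redirects $HOME-relative caches to local disk
    "XDG_CACHE_HOME": "/tmp/wjh/.cache",  # redirects XDG-aware caches (~/.cache/*) likewise
}
base_flags = {
    "host": "127.0.0.1",
    "port": "8000",
    "served-model-name": "qwen3-30b-a3b",
}

# Config env overrides win over the inherited process environment.
env = {**os.environ, **base_envs}
os.makedirs(env["XDG_CACHE_HOME"], exist_ok=True)

# Build "vllm serve <model> --host ... --port ... --served-model-name ..."
cmd = ["vllm", "serve", model]
for key, value in base_flags.items():
    cmd += [f"--{key}", value]

subprocess.run(cmd, env=env, check=True)
```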