{
  "_note": "Canonical benchmark data for the blog. Source: tco-analysis/results/<model>/summary.txt, measured on TPU v6e-4, vLLM 0.20.0 (tpu-inference), KV cache fp8_e5m2. prefill_tok_s = ctx / p50_ttft; decode agg_tok_s = bs*1000/p50_tpot; e2e values are vLLM bench serve reported throughput/latency.",
  "platform": "TPU v6e-4 (4 chips, 1 VM), vLLM 0.20.0 / tpu-inference, KV=fp8_e5m2",
  "models": {
    "Qwen3.5-4B": {
      "label": "Qwen3.5-4B (dense, 4B active)", "parallel": "tp1", "chips": 1, "config": "GDN, MBT2048",
      "prefill": [
        {"ctx": 512,  "p50_ttft_ms": 432.43, "p99_ttft_ms": 433.28, "prefill_tok_s": 1184.0},
        {"ctx": 1024, "p50_ttft_ms": 452.04, "p99_ttft_ms": 453.11, "prefill_tok_s": 2265.3},
        {"ctx": 2048, "p50_ttft_ms": 495.40, "p99_ttft_ms": 497.01, "prefill_tok_s": 4134.0},
        {"ctx": 4096, "p50_ttft_ms": 982.95, "p99_ttft_ms": 986.74, "prefill_tok_s": 4167.1},
        {"ctx": 8192, "p50_ttft_ms": 1961.47,"p99_ttft_ms": 1969.91,"prefill_tok_s": 4176.5}
      ],
      "decode": [
        {"ctx": 1024, "bs": 1,  "p50_tpot_ms": 10.72, "p99_tpot_ms": 10.72, "agg_tok_s": 93.3,  "req_s": 0.55},
        {"ctx": 1024, "bs": 4,  "p50_tpot_ms": 16.18, "p99_tpot_ms": 18.31, "agg_tok_s": 247.3, "req_s": 1.42},
        {"ctx": 1024, "bs": 16, "p50_tpot_ms": 41.39, "p99_tpot_ms": 647.30,"agg_tok_s": 386.6, "req_s": 0.99},
        {"ctx": 1024, "bs": 64, "p50_tpot_ms": 80.50, "p99_tpot_ms": 97.15, "agg_tok_s": 795.0, "req_s": 1.94},
        {"ctx": 4096, "bs": 1,  "p50_tpot_ms": 10.76, "p99_tpot_ms": 10.77, "agg_tok_s": 92.9,  "req_s": 0.43},
        {"ctx": 4096, "bs": 4,  "p50_tpot_ms": 25.32, "p99_tpot_ms": 34.06, "agg_tok_s": 158.0, "req_s": 0.71},
        {"ctx": 4096, "bs": 16, "p50_tpot_ms": 84.98, "p99_tpot_ms": 89.23, "agg_tok_s": 188.3, "req_s": 0.86},
        {"ctx": 4096, "bs": 64, "p50_tpot_ms": 84.97, "p99_tpot_ms": 89.23, "agg_tok_s": 753.2, "req_s": 0.86}
      ],
      "e2e": [
        {"rate": "0.2",  "req_s": 0.19, "out_tok_s": 193.6, "p50_ttft_ms": 509,  "p99_ttft_ms": 33147,  "p50_tpot_ms": 12.3,  "p99_tpot_ms": 15.4,  "p50_e2e_ms": 13083,  "p99_e2e_ms": 48882},
        {"rate": "0.3",  "req_s": 0.28, "out_tok_s": 282.7, "p50_ttft_ms": 507,  "p99_ttft_ms": 1185,   "p50_tpot_ms": 12.8,  "p99_tpot_ms": 14.3,  "p50_e2e_ms": 13665,  "p99_e2e_ms": 15097},
        {"rate": "0.4",  "req_s": 0.36, "out_tok_s": 367.0, "p50_ttft_ms": 506,  "p99_ttft_ms": 1099,   "p50_tpot_ms": 13.4,  "p99_tpot_ms": 15.7,  "p50_e2e_ms": 14369,  "p99_e2e_ms": 16598},
        {"rate": "0.45", "req_s": 0.40, "out_tok_s": 407.3, "p50_ttft_ms": 507,  "p99_ttft_ms": 1141,   "p50_tpot_ms": 13.8,  "p99_tpot_ms": 15.9,  "p50_e2e_ms": 14609,  "p99_e2e_ms": 16802},
        {"rate": "inf",  "req_s": 0.14, "out_tok_s": 143.0, "p50_ttft_ms": 6762, "p99_ttft_ms": 219855, "p50_tpot_ms": 133.8, "p99_tpot_ms": 237.2, "p50_e2e_ms": 206829, "p99_e2e_ms": 285887, "note": "single-chip 40-way concurrency collapse; not a clean saturation point"}
      ]
    },
    "Qwen3-30B-A3B": {
      "label": "Qwen3-30B-A3B (MoE, 3B active / 30B total)", "parallel": "tp4", "chips": 4, "config": "GMU0.8, MBT8192",
      "prefill": [
        {"ctx": 512,  "p50_ttft_ms": 35.02,  "p99_ttft_ms": 35.35,  "prefill_tok_s": 14618.4},
        {"ctx": 1024, "p50_ttft_ms": 50.97,  "p99_ttft_ms": 51.44,  "prefill_tok_s": 20090.6},
        {"ctx": 2048, "p50_ttft_ms": 84.02,  "p99_ttft_ms": 85.46,  "prefill_tok_s": 24374.3},
        {"ctx": 4096, "p50_ttft_ms": 157.16, "p99_ttft_ms": 158.49, "prefill_tok_s": 26063.1},
        {"ctx": 8192, "p50_ttft_ms": 317.92, "p99_ttft_ms": 322.08, "prefill_tok_s": 25767.7}
      ],
      "decode": [
        {"ctx": 1024, "bs": 1,  "p50_tpot_ms": 6.98,  "p99_tpot_ms": 7.10,  "agg_tok_s": 143.4,  "req_s": 1.07},
        {"ctx": 1024, "bs": 4,  "p50_tpot_ms": 8.78,  "p99_tpot_ms": 9.93,  "agg_tok_s": 455.5,  "req_s": 3.12},
        {"ctx": 1024, "bs": 16, "p50_tpot_ms": 14.94, "p99_tpot_ms": 17.47, "agg_tok_s": 1071.3, "req_s": 3.86},
        {"ctx": 1024, "bs": 64, "p50_tpot_ms": 48.10, "p99_tpot_ms": 152.69,"agg_tok_s": 1330.5, "req_s": 7.50},
        {"ctx": 4096, "bs": 1,  "p50_tpot_ms": 6.99,  "p99_tpot_ms": 7.11,  "agg_tok_s": 143.0,  "req_s": 0.96},
        {"ctx": 4096, "bs": 4,  "p50_tpot_ms": 9.74,  "p99_tpot_ms": 12.94, "agg_tok_s": 410.7,  "req_s": 2.30},
        {"ctx": 4096, "bs": 16, "p50_tpot_ms": 22.27, "p99_tpot_ms": 28.53, "agg_tok_s": 718.5,  "req_s": 4.12},
        {"ctx": 4096, "bs": 64, "p50_tpot_ms": 96.76, "p99_tpot_ms": 213.91,"agg_tok_s": 661.4,  "req_s": 4.25}
      ],
      "e2e": [
        {"rate": "0.2", "req_s": 0.19, "out_tok_s": 197.3,  "p50_ttft_ms": 90,  "p99_ttft_ms": 2682, "p50_tpot_ms": 7.9,  "p99_tpot_ms": 9.0,  "p50_e2e_ms": 8233,  "p99_e2e_ms": 11090},
        {"rate": "0.4", "req_s": 0.36, "out_tok_s": 365.9,  "p50_ttft_ms": 92,  "p99_ttft_ms": 190,  "p50_tpot_ms": 9.2,  "p99_tpot_ms": 10.1, "p50_e2e_ms": 9472,  "p99_e2e_ms": 10444},
        {"rate": "0.6", "req_s": 0.52, "out_tok_s": 536.6,  "p50_ttft_ms": 94,  "p99_ttft_ms": 1331, "p50_tpot_ms": 11.2, "p99_tpot_ms": 13.8, "p50_e2e_ms": 11572, "p99_e2e_ms": 14202},
        {"rate": "0.8", "req_s": 0.62, "out_tok_s": 635.3,  "p50_ttft_ms": 116, "p99_ttft_ms": 6335, "p50_tpot_ms": 18.8, "p99_tpot_ms": 28.7, "p50_e2e_ms": 24647, "p99_e2e_ms": 29578},
        {"rate": "inf", "req_s": 1.27, "out_tok_s": 1303.0, "p50_ttft_ms": 712, "p99_ttft_ms": 1092, "p50_tpot_ms": 22.3, "p99_tpot_ms": 22.8, "p50_e2e_ms": 23558, "p99_e2e_ms": 23572}
      ]
    },
    "Qwen3-32B": {
      "label": "Qwen3-32B (dense, 32B active)", "parallel": "tp4", "chips": 4, "config": "GMU0.8, MBT8192",
      "prefill": [
        {"ctx": 512,  "p50_ttft_ms": 37.81,  "p99_ttft_ms": 38.13,  "prefill_tok_s": 13540.9},
        {"ctx": 1024, "p50_ttft_ms": 57.62,  "p99_ttft_ms": 58.22,  "prefill_tok_s": 17772.4},
        {"ctx": 2048, "p50_ttft_ms": 104.24, "p99_ttft_ms": 105.61, "prefill_tok_s": 19647.8},
        {"ctx": 4096, "p50_ttft_ms": 207.77, "p99_ttft_ms": 209.71, "prefill_tok_s": 19714.3},
        {"ctx": 8192, "p50_ttft_ms": 460.58, "p99_ttft_ms": 463.20, "prefill_tok_s": 17786.2}
      ],
      "decode": [
        {"ctx": 1024, "bs": 1,  "p50_tpot_ms": 17.90, "p99_tpot_ms": 17.93, "agg_tok_s": 55.9,   "req_s": 0.43},
        {"ctx": 1024, "bs": 4,  "p50_tpot_ms": 18.20, "p99_tpot_ms": 19.39, "agg_tok_s": 219.8,  "req_s": 1.58},
        {"ctx": 1024, "bs": 16, "p50_tpot_ms": 22.37, "p99_tpot_ms": 27.41, "agg_tok_s": 715.1,  "req_s": 4.57},
        {"ctx": 1024, "bs": 64, "p50_tpot_ms": 58.40, "p99_tpot_ms": 65.28, "agg_tok_s": 1096.0, "req_s": 7.59},
        {"ctx": 4096, "bs": 1,  "p50_tpot_ms": 19.06, "p99_tpot_ms": 19.25, "agg_tok_s": 52.5,   "req_s": 0.40},
        {"ctx": 4096, "bs": 4,  "p50_tpot_ms": 21.44, "p99_tpot_ms": 25.18, "agg_tok_s": 186.5,  "req_s": 1.20},
        {"ctx": 4096, "bs": 16, "p50_tpot_ms": 37.71, "p99_tpot_ms": 130.61,"agg_tok_s": 424.3,  "req_s": 1.82},
        {"ctx": 4096, "bs": 64, "p50_tpot_ms": 82.54, "p99_tpot_ms": 92.82, "agg_tok_s": 775.4,  "req_s": 2.84}
      ],
      "e2e": [
        {"rate": "0.15", "req_s": 0.14, "out_tok_s": 143.8, "p50_ttft_ms": 115,  "p99_ttft_ms": 2243, "p50_tpot_ms": 18.0, "p99_tpot_ms": 18.3, "p50_e2e_ms": 18542, "p99_e2e_ms": 20566},
        {"rate": "0.3",  "req_s": 0.26, "out_tok_s": 270.5, "p50_ttft_ms": 115,  "p99_ttft_ms": 1042, "p50_tpot_ms": 18.9, "p99_tpot_ms": 20.1, "p50_e2e_ms": 19493, "p99_e2e_ms": 20698},
        {"rate": "0.45", "req_s": 0.37, "out_tok_s": 379.8, "p50_ttft_ms": 116,  "p99_ttft_ms": 295,  "p50_tpot_ms": 19.5, "p99_tpot_ms": 20.9, "p50_e2e_ms": 20201, "p99_e2e_ms": 21492},
        {"rate": "0.6",  "req_s": 0.45, "out_tok_s": 465.8, "p50_ttft_ms": 123,  "p99_ttft_ms": 3983, "p50_tpot_ms": 21.0, "p99_tpot_ms": 28.1, "p50_e2e_ms": 21651, "p99_e2e_ms": 28822},
        {"rate": "inf",  "req_s": 0.88, "out_tok_s": 901.1, "p50_ttft_ms": 1190, "p99_ttft_ms": 9249, "p50_tpot_ms": 43.2, "p99_tpot_ms": 44.0, "p50_e2e_ms": 45401, "p99_e2e_ms": 45443}
      ]
    }
  },
  "headline": {
    "moe_vs_dense_decode_tpot": "30B-A3B 7.0ms vs 32B 17.9ms @ bs1 (~2.5x)",
    "moe_vs_dense_e2e_capacity": "30B-A3B 1.27 req/s vs 32B 0.88 req/s (~1.4x); out_tok/s 1303 vs 901",
    "moe_prefill_peak": "30B-A3B 26063 tok/s vs 32B 19714 tok/s",
    "4b_concurrency_collapse": "4B tp1 inf-rate collapses (TPOT 134ms); sustainable ~0.45 req/s"
  }
}
