-
Notifications
You must be signed in to change notification settings - Fork 41
Expand file tree
/
Copy pathbasin_compat.sh
More file actions
executable file
·116 lines (101 loc) · 4.18 KB
/
basin_compat.sh
File metadata and controls
executable file
·116 lines (101 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/bin/bash
# tools/basin_compat.sh — Engine FP32 basin compatibility benchmark
#
# Runs identical prompt through our engine (TQ_LAYER_TRACE) and llama-debug
# (--verbose --tensor-filter ^l_out-) on the same GGUF model. Reports
# per-layer residual-sum divergence and assigns a Tier classification.
#
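# Tier 1 (Production): all measured layers rel_diff < 5%
# Tier 2 (Research grade): last 5 layers < 50% and first half of layers < 20%
#                          (late-layer drift, typically 10-40% rel_diff)
# Tier 3 (Needs research): anything else (early or persistent divergence)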
#
# See docs/engine_basin_tiers.md for the theory.
#
# CAVEAT: this tool is designed for hybrid DeltaNet/self-attn MoE models
# (like Qwen3.6-A3B) where llama-debug emits per-layer N=1 decode dumps.
# For pure feedforward models (Llama, Phi, Gemma), llama-debug only dumps
# N=1 on the FINAL layer (due to ggml's get_rows optimization), so
# per-layer comparison is limited. Use paired-diff alternative tools for
# those architectures (see docs/custom-quantization.md).
#
# Usage:
# tools/basin_compat.sh models/<model>.gguf
# tools/basin_compat.sh models/<model>.gguf "Hello" # custom prompt
# Strict mode: abort on a failed command, an unset variable, or a failed
# pipeline stage.
set -euo pipefail

# Repository root = parent of the directory that holds this script.
ROOT="$(cd -- "$(dirname -- "$0")/.." && pwd)"

# $1: path to the GGUF model (required).
# $2: prompt fed to both engines (optional, defaults to "Hello").
MODEL="${1:?usage: basin_compat.sh <model.gguf> [prompt]}"
PROMPT="${2:-Hello}"

# Directory that receives the captured engine logs; override with OUT=...
OUT="${OUT:-/tmp/basin_compat}"

# Fail early with a visible message. The engine runs further down send their
# stderr to log files, so a mistyped model path would otherwise abort the
# script without printing anything to the terminal.
if [[ ! -r "$MODEL" ]]; then
  printf 'error: model file not found or not readable: %s\n' "$MODEL" >&2
  exit 1
fi

mkdir -p -- "$OUT"

# Model file name without directory and without the .gguf suffix; used as
# the prefix of every log file written into $OUT.
name=$(basename -- "$MODEL" .gguf)

echo "== basin compatibility =="
echo "model: $name"
echo "prompt: $PROMPT"
echo
# fail_with_log LABEL STATUS LOGFILE
# Reports a failed engine run on stderr, shows the tail of its captured log
# (the engine's own stderr was redirected there, so nothing reached the
# terminal), and exits with the engine's original exit status.
fail_with_log() {
  local label=$1 status=$2 log=$3
  printf 'error: %s exited with status %s; last lines of %s:\n' \
    "$label" "$status" "$log" >&2
  tail -n 20 -- "$log" >&2 || true
  exit "$status"
}

ours_err="$OUT/$name.ours.stderr"
llama_err="$OUT/$name.llama.stderr"
llama_out="$OUT/$name.llama.stdout"

echo "→ running ours (TQ_LAYER_TRACE)..."
# The layer trace is read back from stderr later; stdout is discarded.
status=0
TQ_LAYER_TRACE=1 \
  "$ROOT/build/quant" "$MODEL" \
  -p "$PROMPT" -n 1 -T 0 -j 1 \
  2>"$ours_err" >/dev/null || status=$?
(( status == 0 )) || fail_with_log "ours ($ROOT/build/quant)" "$status" "$ours_err"

echo "→ running llama-debug (--tensor-filter ^l_out-)..."
# Stop any process whose command line mentions llama-debug before starting a
# new one (presumably leftovers from an earlier run -- confirm). Ask with
# SIGTERM first, then SIGKILL whatever is still alive. Both calls are
# best-effort: "no process matched" is not an error.
pkill -f "llama-debug" 2>/dev/null || true
sleep 1
if pkill -9 -f "llama-debug" 2>/dev/null; then
  sleep 1
fi

# stdout carries the per-tensor dump that is parsed afterwards; stderr is
# kept only for diagnostics.
status=0
"$ROOT/refs/llama.cpp/build/bin/llama-debug" \
  -m "$MODEL" \
  -p "$PROMPT" \
  --verbose --tensor-filter "^l_out-" \
  -n 1 --temp 0 -t 1 --ctx-size 128 \
  --device none -fit off --no-op-offload \
  2>"$llama_err" >"$llama_out" || status=$?
(( status == 0 )) || fail_with_log "llama-debug" "$status" "$llama_err"
# Compare the two dumps and print the per-layer table plus the tier verdict.
# The log paths are passed as arguments and the heredoc delimiter is quoted,
# so the shell never splices path text into the Python source (a quote or
# backslash in $OUT or the model name can no longer break the program).
python3 - "$OUT/$name.ours.stderr" "$OUT/$name.llama.stdout" <<'EOF'
import re, sys

ours_path, llama_path = sys.argv[1], sys.argv[2]

# --- our engine: layer -> {position -> residual sum} ----------------------
# NOTE(review): the sum pattern accepts only digits, '-' and '.'; a value
# printed in exponent notation would be cut at the 'e'. Confirm the trace
# format never uses it.
ours_lout = {}
with open(ours_path) as fh:
    for line in fh:
        m = re.match(r'\[trace\] l_out-(\d+) pos=(\d+) sum=([\-\d\.]+)', line)
        if m:
            ours_lout.setdefault(int(m.group(1)), {})[int(m.group(2))] = float(m.group(3))
if not ours_lout:
    print("error: no layer trace from ours — is TQ_LAYER_TRACE supported on this model?")
    sys.exit(1)

# Compare at the lowest position our engine traced.
# NOTE(review): llama-debug values below come from whichever N == 1 dump was
# seen last for a layer; verify both refer to the same token position when
# the prompt is longer than one token.
positions = sorted({p for v in ours_lout.values() for p in v})
pos_use = positions[0]

# --- llama-debug: layer -> residual sum -----------------------------------
# A header line names the tensor and its shape; the matching "sum = ..." line
# follows later. Only dumps whose second dimension N is 1 are kept.
llama_de = {}
cur, N = None, None
with open(llama_path) as fh:
    for line in fh:
        m = re.match(r'common_debug_cb_eval:\s+l_out-(\d+) = \(f32\)\s+(?:ADD|DUP|VIEW)\([^{]+\{[^,]+, (\d+)', line)
        if m:
            cur = int(m.group(1))
            N = int(m.group(2))
            continue
        ms = re.match(r'\s+sum\s*=\s*([\-\d\.]+)', line)
        if ms and cur is not None and N == 1:
            llama_de[cur] = float(ms.group(1))
            cur = None  # one sum per header
if not llama_de:
    print("error: no layer dump from llama-debug")
    sys.exit(1)

# --- per-layer relative difference ----------------------------------------
# Layer count is estimated from the highest layer index seen on either side.
n_layers = max(max(ours_lout.keys()), max(llama_de.keys())) + 1
layer_diffs = []
max_rel = 0.0
tier1_threshold = 0.05 # 5%
tier2_threshold = 0.50 # 50%
for L in range(n_layers):
    ov = ours_lout.get(L, {}).get(pos_use)
    ld = llama_de.get(L)
    if ov is None or ld is None:
        continue  # layer missing on one side: nothing to compare
    # The floor on the denominator avoids dividing by a near-zero reference.
    rd = abs(ov - ld) / max(abs(ld), 1e-6)
    layer_diffs.append((L, ov, ld, rd))
    if rd > max_rel:
        max_rel = rd

# Without this guard an empty comparison leaves max_rel at 0.0 and would be
# reported as Tier 1 even though not a single layer was measured.
if not layer_diffs:
    print("error: no layer has both an engine trace and a llama-debug dump — nothing to compare")
    sys.exit(1)

print(f"{'Layer':>5} {'ours':>12} {'llama':>12} {'rel_diff':>10}")
for L, ov, ld, rd in layer_diffs:
    mark = "**" if rd > tier1_threshold else ""  # flag layers above the Tier 1 limit
    print(f"{L:>5} {ov:>12.4f} {ld:>12.4f} {rd:>10.4f} {mark}")

# --- classification --------------------------------------------------------
# "late" = the last five layer indices, "early" = the first half.
late_max = max((rd for L, _, _, rd in layer_diffs if L >= n_layers - 5), default=0.0)
early_max = max((rd for L, _, _, rd in layer_diffs if L < n_layers // 2), default=0.0)
print(f"\n== Summary ==")
print(f"layers measured: {len(layer_diffs)} / {n_layers}")
print(f"max rel_diff overall: {max_rel:.4f}")
print(f"max rel_diff early (L < {n_layers//2}): {early_max:.4f}")
print(f"max rel_diff late (last 5 layers): {late_max:.4f}")
if max_rel < tier1_threshold:
    tier = "Tier 1 — Production quality (all layers within 5%)"
elif late_max < tier2_threshold and early_max < 0.20:
    tier = "Tier 2 — Research grade (late-layer drift, early layers stable)"
else:
    tier = "Tier 3 — Needs research (early or persistent divergence)"
print(f"\n=> {tier}")
EOF