
Commit 588ff05

Version: 0.10.16-alpha.2
Refactor DNN/transformer stack: BuildContext, chunked prefill

- Major refactor: explicit BuildContext, TrainingMode, buffer management
- Enable efficient chunked prefill/inference in transformer/attention
- Replace std::vector shapes with fixed-capacity TensorShape types
- Add modular C++/CUDA attention, positional, and tensor op interfaces

BREAKING CHANGE: Updates all component, tensor, and operator APIs to use BuildContext, TrainingMode, and shape_t; removes legacy interfaces and changes the model/component build and training lifecycle. Existing code must be updated for the new APIs.
1 parent 60f7914 commit 588ff05

134 files changed

Lines changed: 6592 additions & 3429 deletions
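The headline feature of this refactor is chunked prefill. As a point of reference (this sketch is my addition, not part of the commit), Hugging Face's GPT-2 can emulate the same behavior with its KV cache: feed the prompt in chunks via past_key_values, and the final logits should match a single full-prompt pass. The chunk size of 2 and the model choice are arbitrary assumptions for illustration.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

ids = tokenizer.encode("Once upon a time", return_tensors='pt')  # [1, T]

with torch.no_grad():
    # Reference: the whole prompt in one forward pass
    full_logits = model(ids).logits[0, -1]

    # Chunked prefill: feed the prompt two tokens at a time, carrying the cache
    past = None
    for start in range(0, ids.shape[1], 2):
        out = model(ids[:, start:start + 2], past_key_values=past, use_cache=True)
        past = out.past_key_values

# The last chunk's final-position logits should match the full pass
chunked_logits = out.logits[0, -1]
print(f"max |diff|: {(full_logits - chunked_logits).abs().max().item():.2e}")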

Note: large commits have some content hidden by default; a few of the added files below appear without their names.

CMakePresets.json

Lines changed: 2 additions & 1 deletion
@@ -67,7 +67,8 @@
       "strategy": "external"
     },
     "cacheVariables": {
-      "CMAKE_BUILD_TYPE": "Debug"
+      "CMAKE_BUILD_TYPE": "Debug",
+      "CMAKE_CUDA_FLAGS_DEBUG": "-G"
     }
   },
   {
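For context on the new cache variable: nvcc's -G flag generates debug information for device code and disables most device-side optimizations, which makes the CUDA kernels built by this Debug preset steppable in cuda-gdb and Nsight, at the cost of much slower kernels.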

Data/Scripts/Gpt2/gpt2_output.py

Lines changed: 68 additions & 0 deletions
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Same input as your Mila test
input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
print(f"Token ids: {input_ids}")

# Storage for hook outputs
captured = {}

def make_hook(name):
    def fn(module, input, output):
        t = output if isinstance(output, torch.Tensor) else output[0]
        captured[name] = (t.min().item(), t.max().item(), t.shape)
    return fn

hooks = []

# Encoder output (wte + wpe)
hooks.append(model.transformer.drop.register_forward_hook(make_hook('lenc_out')))

for i, block in enumerate(model.transformer.h):
    # ln_1
    hooks.append(block.ln_1.register_forward_hook(make_hook(f'layer_{i}.ln_1')))
    # fc_qkv_proj (c_attn)
    hooks.append(block.attn.c_attn.register_forward_hook(make_hook(f'layer_{i}.fc_qkv_proj')))
    # fc_out_proj (c_proj)
    hooks.append(block.attn.c_proj.register_forward_hook(make_hook(f'layer_{i}.fc_out_proj')))
    # ln_2
    hooks.append(block.ln_2.register_forward_hook(make_hook(f'layer_{i}.ln_2')))
    # mlp.fc_1 (c_fc)
    hooks.append(block.mlp.c_fc.register_forward_hook(make_hook(f'layer_{i}.mlp.fc_1')))
    # gelu
    hooks.append(block.mlp.act.register_forward_hook(make_hook(f'layer_{i}.mlp.gelu')))
    # mlp.fc_2 (c_proj)
    hooks.append(block.mlp.c_proj.register_forward_hook(make_hook(f'layer_{i}.mlp.fc_2')))
    # full block output (residual)
    hooks.append(block.register_forward_hook(make_hook(f'layer_{i}.residual_out')))

with torch.no_grad():
    model(input_ids)

for h in hooks:
    h.remove()

# Print in same format as your Mila debug output
print(f"\nlenc out: [{captured['lenc_out'][0]:.3f}, {captured['lenc_out'][1]:.3f}]")
print()

for i in range(12):
    for key in [f'layer_{i}.ln_1', f'layer_{i}.fc_qkv_proj', f'layer_{i}.fc_out_proj',
                f'layer_{i}.ln_2', f'layer_{i}.mlp.fc_1', f'layer_{i}.mlp.gelu',
                f'layer_{i}.mlp.fc_2', f'layer_{i}.residual_out']:
        if key in captured:
            mn, mx, shape = captured[key]
            print(f"{key}: [{mn:.3f}, {mx:.3f}] shape={list(shape)}")
    print()

# Print peak residual across all layers to set kResidualAbsLimit
peak = max(abs(captured[f'layer_{i}.residual_out'][1]) for i in range(12))
peak_min = min(captured[f'layer_{i}.residual_out'][0] for i in range(12))
print(f"Peak residual: min={peak_min:.3f}, max={peak:.3f}")
print(f"Suggested kResidualAbsLimit: {peak * 1.5:.1f}")
Lines changed: 9 additions & 0 deletions
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

text = "You are a helpful AI Assistant. Your name is Mila"
tokens = tokenizer.encode(text)
print(f"Token count: {len(tokens)}")
print(f"Token IDs: {tokens}")
print(f"\nDecoded back: '{tokenizer.decode(tokens)}'")
Lines changed: 9 additions & 0 deletions
import torch
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2')

for i in [0, 1]:
    w = model.state_dict()[f'transformer.h.{i}.mlp.c_proj.weight'].T
    print(f"Layer {i} fc_2 after .T: min={w.min():.6f} max={w.max():.6f} mean={w.mean():.6f}")
    print(f"First 5x5:\n{w[:5, :5]}\n")

Data/Scripts/Gpt2/hf_decode.py

Lines changed: 27 additions & 0 deletions
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained( 'gpt2' )
tokenizer = GPT2Tokenizer.from_pretrained( 'gpt2' )
model.eval()

input_ids = tokenizer.encode( "Once upon a time", return_tensors='pt' )
print( f"Token ids: {input_ids.tolist()}" )

with torch.no_grad():
    out = model( input_ids )

logits = out.logits[ 0 ]  # [T, V]

print( "\n=== Token 11 logit at every position ===" )
for pos in range( 4 ):
    print( f"HF token 11 (',') at pos {pos}: {logits[ pos, 11 ].item():.4f}" )

print( "\n=== Top token at every position ===" )
for pos in range( 4 ):
    top_token = logits[ pos ].argmax().item()
    top_logit = logits[ pos, top_token ].item()
    print( f"HF top token at pos {pos}: token={top_token} '{tokenizer.decode([top_token])}' logit={top_logit:.4f}" )

print( f"\n=== Token 11 at pos 3 (expected ~-50.47) ===" )
print( f"HF token 11 at pos 3: {logits[ 3, 11 ].item():.4f}" )
Lines changed: 9 additions & 0 deletions
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

model = GPT2LMHeadModel.from_pretrained("gpt2")
tok = GPT2Tokenizer.from_pretrained("gpt2")

ids = tok.encode("Once upon a time", return_tensors="pt")
out = model.generate(ids, max_new_tokens=64, do_sample=False)  # greedy
print(tok.decode(out[0]))
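With do_sample=False, generate performs deterministic greedy decoding, so the printed continuation is the exact token sequence a correct greedy decoder should reproduce token for token.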

Data/Scripts/Gpt2/hf_mila_test.py

Lines changed: 63 additions & 0 deletions
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained( 'gpt2' )
model.eval()
tokenizer = GPT2Tokenizer.from_pretrained( 'gpt2' )

# This matches Mila's decode step 0:
# - Prefill: "Once upon a time" (4 tokens)
# - First decoded token: ',' (token 11)
input_text = "Once upon a time,"
input_ids = tokenizer.encode( input_text, return_tensors='pt' )
print( f"Token ids: {input_ids}" )
print( f"Seq len: {input_ids.shape[1]}" )

captured = {}

def make_hook_last( name ):
    """Captures min/max of the LAST token position only."""
    def fn( module, input, output ):
        t = output if isinstance( output, torch.Tensor ) else output[0]
        last = t[0, -1, :]
        captured[name] = ( last.min().item(), last.max().item() )
    return fn

hooks = []
for i, block in enumerate( model.transformer.h ):
    hooks.append( block.ln_1.register_forward_hook( make_hook_last( f'layer_{i}.ln_1' ) ) )
    hooks.append( block.attn.c_attn.register_forward_hook( make_hook_last( f'layer_{i}.fc_qkv_proj' ) ) )
    hooks.append( block.attn.c_proj.register_forward_hook( make_hook_last( f'layer_{i}.fc_out_proj' ) ) )
    hooks.append( block.ln_2.register_forward_hook( make_hook_last( f'layer_{i}.ln_2' ) ) )
    hooks.append( block.mlp.c_fc.register_forward_hook( make_hook_last( f'layer_{i}.mlp.fc_1' ) ) )
    hooks.append( block.mlp.act.register_forward_hook( make_hook_last( f'layer_{i}.mlp.gelu' ) ) )
    hooks.append( block.mlp.c_proj.register_forward_hook( make_hook_last( f'layer_{i}.mlp.fc_2' ) ) )
    hooks.append( block.register_forward_hook( make_hook_last( f'layer_{i}.residual_out' ) ) )

with torch.no_grad():
    out = model( input_ids )

logits = out.logits[0, -1]
top5 = torch.topk( logits, 5 )
print( f"\nTop 5 predictions after '{input_text}':" )
for v, idx in zip( top5.values, top5.indices ):
    print( f"  {tokenizer.decode([idx.item()])!r:15} {v.item():.4f}" )

for h in hooks:
    h.remove()

print( "\n=== Per-layer values at LAST token position ===" )
for i in range( 12 ):
    print( f"\nlayer_{i}:" )
    for key in [
        f'layer_{i}.ln_1',
        f'layer_{i}.fc_qkv_proj',
        f'layer_{i}.fc_out_proj',
        f'layer_{i}.ln_2',
        f'layer_{i}.mlp.fc_1',
        f'layer_{i}.mlp.gelu',
        f'layer_{i}.mlp.fc_2',
        f'layer_{i}.residual_out',
    ]:
        if key in captured:
            mn, mx = captured[key]
            print( f"  {key}: [{mn:.3f}, {mx:.3f}]" )

Data/Scripts/Gpt2/hf_vcache.py

Lines changed: 51 additions & 0 deletions
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
print(f"Token ids: {input_ids}")
print(f"Seq len: {input_ids.shape[1]}")

captured = {}

def make_hook(name):
    def fn(module, input, output):
        captured[name] = output.detach()
    return fn

hooks = []
hooks.append(model.transformer.h[0].attn.c_attn.register_forward_hook(make_hook('layer_0.qkv')))
hooks.append(model.transformer.h[1].attn.c_attn.register_forward_hook(make_hook('layer_1.qkv')))

with torch.no_grad():
    model(input_ids)

for h in hooks:
    h.remove()

def extract_v( qkv, layer_name, num_heads=12, head_size=64 ):
    # qkv shape: [1, T, 2304]
    T = qkv.shape[1]
    q, k, v = qkv.split( 768, dim=-1 )  # each [1, T, 768]
    # Reshape to [B, NH, T, HS]
    v = v.view( 1, T, num_heads, head_size ).permute( 0, 2, 1, 3 )  # [1, 12, T, 64]
    k = k.view( 1, T, num_heads, head_size ).permute( 0, 2, 1, 3 )  # [1, 12, T, 64]
    print( f"\n=== {layer_name} ===" )
    for head in range( 2 ):  # show head 0 and head 1
        print( f"\n  V head {head}, positions 0..{T-1} (first 8 elements each):" )
        for pos in range( T ):
            vals = v[0, head, pos, :8].tolist()
            formatted = " ".join( f"{x:10.6f}" for x in vals )
            print( f"    pos {pos}: [ {formatted} ... ]" )
        print( f"\n  K head {head}, positions 0..{T-1} (first 8 elements each):" )
        for pos in range( T ):
            vals = k[0, head, pos, :8].tolist()
            formatted = " ".join( f"{x:10.6f}" for x in vals )
            print( f"    pos {pos}: [ {formatted} ... ]" )

extract_v( captured['layer_0.qkv'], 'Layer 0' )
extract_v( captured['layer_1.qkv'], 'Layer 1' )
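Presumably these per-position K and V rows serve as golden values for validating a KV cache: if the cached entries for positions 0..T-1 match this dump, the cache write path is correct independently of the attention arithmetic.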
