@@ -48,24 +48,16 @@ def cross_entropy(
     n, v = logits.shape
     losses = torch.zeros([n], dtype=logits.dtype, device=logits.device)
 
-    # Flatten logits once at the beginning
-    logits_flat = logits.view(-1)
-
     for tile_n in hl.tile(n):
         # Get data for this tile
         labels_tile = labels[tile_n]  # [tile_size]
-        base_indices_tile = tile_n.index * v  # [tile_size]
-
-        # Compute the actual flat indices by adding the label offset
-        flat_indices = base_indices_tile + labels_tile
-
-        # Load the logits at the target indices
-        logits_at_target = hl.load(logits_flat, [flat_indices])
 
         # Compute log_softmax for numerical stability
         # Load the full rows for this tile
         logits_rows = logits[tile_n, :]  # [tile_size, V]
 
+        logits_at_target = logits_rows.gather(1, labels_tile.unsqueeze(1)).squeeze(1)
+
         # Compute log-sum-exp
         max_logits = torch.amax(logits_rows, dim=-1, keepdim=True)
         shifted = logits_rows - max_logits
@@ -89,7 +81,7 @@ def main() -> None:
     """
     Main entry point that runs the cross entropy kernel verification.
     """
-    batch_size, seq_len, vocab_size = 8, 2048, 131072
+    batch_size, seq_len, vocab_size = 8, 2048, 2048
     n = batch_size * seq_len
     logits = torch.randn(n, vocab_size, device=DEVICE, dtype=torch.float32)
     labels = torch.randint(0, vocab_size, (n,), device=DEVICE, dtype=LONG_INT_TYPE)
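
For context, here is a minimal plain-PyTorch sketch of the computation the kernel hunk above performs, per row i: loss_i = logsumexp(row_i) - row_i[label_i]. It is not the Helion kernel itself (no tiling; cross_entropy_reference and the shapes are illustrative, not from this commit), but it uses the same gather-based target selection and max-shifted log-sum-exp as the new kernel body, and cross-checks against torch.nn.functional.cross_entropy:

import torch

def cross_entropy_reference(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    # Same trick as the new kernel body: pull each row's target logit with
    # gather instead of flattening and computing manual offsets.
    logits_at_target = logits.gather(1, labels.unsqueeze(1)).squeeze(1)  # [N]

    # Numerically stable log-sum-exp: shift by the row max before exponentiating.
    max_logits = torch.amax(logits, dim=-1, keepdim=True)  # [N, 1]
    shifted = logits - max_logits  # [N, V]
    logsumexp = max_logits.squeeze(-1) + torch.log(torch.exp(shifted).sum(dim=-1))  # [N]

    # Per-sample loss: logsumexp over the row minus the logit at the target class.
    return logsumexp - logits_at_target

n, v = 16, 2048
logits = torch.randn(n, v)
labels = torch.randint(0, v, (n,))
torch.testing.assert_close(
    cross_entropy_reference(logits, labels),
    torch.nn.functional.cross_entropy(logits, labels, reduction="none"),
)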