Optimize the V4 indexer

grimoire · grimoire · commit a0ec686b7f99 · 2026-05-11T11:19:52.000+08:00
diff --git a/lmdeploy/pytorch/backends/cuda/attention/v4.py b/lmdeploy/pytorch/backends/cuda/attention/v4.py
@@ -363,7 +363,7 @@ def _forward_decoding(self, query, kv, attn_sink, attn_metadata: CudaV4Attention
         if self.compress_ratio:
             compressed_cache_fp8 = caches['compressed_kv_fp8']
             if index_out is not None:
-                indices_in_kvcache = index_out.indices_in_kvcache
+                indices_in_kvcache = index_out.indices_in_kvcache.unsqueeze(1)  # [bsz, 1, topk_width]
                 topk_length = index_out.topk_length
             elif self.compress_ratio == 4:
                 indices_in_kvcache = attn_metadata.compress_fallback_indices_r4
@@ -440,7 +440,7 @@ def _select_compress_topk(self, index_out, attn_metadata: CudaV4AttentionMetadat
             return None, None
 
         if index_out is not None:
-            compress_topk = index_out.indices_in_kvcache.squeeze(0)
+            compress_topk = index_out.indices_in_kvcache
             # Offset indexer's logical indices into flat_kv positions
             uncompressed_kv_lens = attn_metadata.prefill_uncompressed_kv_lens
             cu_q_seqlens = attn_metadata.cu_q_seqlens
diff --git a/lmdeploy/pytorch/backends/cuda/v4_indexer.py b/lmdeploy/pytorch/backends/cuda/v4_indexer.py
@@ -34,36 +34,32 @@ def forward(self,
         block_offsets = meta.block_offsets
         cu_q_seqlens = meta.cu_q_seqlens
         kv_seqlens = meta.kv_seqlens
-        is_decoding = meta.is_decoding
         q_seqlens = meta.q_seqlens
         bsz = kv_seqlens.size(0)
         block_size = self._block_size
 
-        # quant query
-        # FP8 quantize Indexer Q (replaces fp4_act_quant for better precision)
-        # we might need to do quant fp4 in the future
-        q_2d = query.reshape(-1, query.size(-1) * query.size(-2))
+        # Reshape to fp8_index expected layout upfront.
+        # query: [bsz, seqlen, n_heads, head_dim] -> [cum_seqlen, n_heads, head_dim]
+        # weights: [bsz, seqlen, n_heads] -> [cum_seqlen, n_heads]
+        q_3d = query.flatten(0, 1)
+        weights_2d = weights.flatten(0, -2)
+
+        # FP8-quantize indexer Q through a flattened 2D view (replaces fp4_act_quant for better precision)
+        q_2d = q_3d.reshape(-1, q_3d.size(-1) * q_3d.size(-2))
         q_fp8, q_scale_2d = quant_fp8(q_2d, group_size=128,
                                        dtype=torch.float8_e4m3fn, scale_fmt='ue8m0')
-        query = q_fp8.view_as(query)
-        q_scale = q_scale_2d.view(query.shape[:-1])
-
-        # reshape q and weights
-        q_3d = query.flatten(0, 1)
-        q_scale = q_scale.flatten(0, -2)
-        weights = weights.flatten(0, -2)
-        q_scale_weighted = q_scale * weights  # [bsz, n_heads]
+        q_3d = q_fp8.view_as(q_3d)
+        q_scale = q_scale_2d.view(q_3d.shape[:-1])  # [cum_seqlen, n_heads]
+        q_scale_weighted = q_scale * weights_2d
 
         total_lens = kv_seqlens
         num_index = torch.div(total_lens, self.compress_ratio, rounding_mode='floor')
         max_kv_seqlen = meta.max_kv_seqlen if meta.max_kv_seqlen is not None else block_offsets.size(1) * block_size
         max_index = max(max_kv_seqlen // self.compress_ratio, 1)
 
         if max_index == 0:
-            if is_decoding:
-                empty = query.new_empty((1, bsz, 0), dtype=torch.long)
-            else:
-                empty = query.new_empty((bsz, 1, 0), dtype=torch.long)
+            total_q = q_3d.size(0)
+            empty = query.new_empty((total_q, 0), dtype=torch.long)
             return V4IndexerOutput(indices_in_kvcache=empty,
                                    topk_length=num_index.new_zeros((bsz,), dtype=torch.int32))
 
@@ -87,11 +83,8 @@ def forward(self,
         else:
             topk = scores.topk(topk_width, dim=-1)[1]
 
-        if is_decoding:
-            topk = topk.unsqueeze(1)  # [bsz, 1, topk_width]
-            return V4IndexerOutput(indices_in_kvcache=topk, topk_length=topk_length)
-        else:
-            return V4IndexerOutput(indices_in_kvcache=topk.unsqueeze(0), topk_length=topk_length)
+        # Always return [total_q, topk_width] — caller handles decode/prefill dimension adaptation
+        return V4IndexerOutput(indices_in_kvcache=topk, topk_length=topk_length)
 
 
 class TritonV4IndexerBuilder(BaseV4IndexerBuilder):
diff --git a/lmdeploy/pytorch/models/deepseek_v4.py b/lmdeploy/pytorch/models/deepseek_v4.py
@@ -237,7 +237,6 @@ def forward(self,
         kv_rope = compressed_kv[..., -rd:].unsqueeze(1)  # [total_flat, 1, rd]
         cos_c, sin_c = compress_pos_emb
         self.apply_rotary.forward_single(kv_rope, cos_c, sin_c, inplace=True, complex_mode=True)
-        compressed_kv[..., -rd:] = kv_rope.squeeze(1)
         if self.rotate:
             compressed_kv = self.compressor_impl.rotate_activation(compressed_kv)
         else:
diff --git a/lmdeploy/pytorch/nn/rotary_embedding.py b/lmdeploy/pytorch/nn/rotary_embedding.py
@@ -270,8 +270,10 @@ def forward_single(self, x: Tensor, cos: Tensor, sin: Tensor, inplace: bool = Tr
         dummy_k = x_3d.new_empty(x_3d.size(0), 0, dummy_dim)
         x_3d, _ = self.forward(x_3d, dummy_k, cos, sin, inplace=False,
                                complex_mode=complex_mode)
-        x.copy_(x_3d.reshape(orig_shape))
-        return x
+        if inplace:
+            x.copy_(x_3d.reshape(orig_shape))
+            return x
+        return x_3d.reshape(orig_shape)
 
 
 class FopeRotaryEmbedding(nn.Module):