Strictly more general shuffle flattening in Hexagon. (#8979)

mcourteaux · web-flow · commit eacc40e7563a · 2026-03-06T15:00:55.000+01:00
* Strictly more general shuffle flattening in Hexagon.
Fix vdelta mishandling don't-care (-1) indices when replicating them into i8 indices.

* clang-format
diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
@@ -1186,15 +1186,16 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b,
                 create_bitcast(a_call->getArgOperand(1), native_ty),
                 create_bitcast(a_call->getArgOperand(0), native_ty), indices);
         } else if (ShuffleVectorInst *a_shuffle = dyn_cast<ShuffleVectorInst>(a)) {
-            bool is_identity = true;
-            for (int i = 0; i < a_elements; i++) {
-                int mask_i = a_shuffle->getMaskValue(i);
-                is_identity = is_identity && (mask_i == i || mask_i == -1);
-            }
-            if (is_identity) {
-                return shuffle_vectors(a_shuffle->getOperand(0),
-                                       a_shuffle->getOperand(1), indices);
+            std::vector<int> new_indices(indices.size());
+            for (size_t i = 0; i < indices.size(); i++) {
+                if (indices[i] != -1) {
+                    new_indices[i] = a_shuffle->getMaskValue(indices[i]);
+                } else {
+                    new_indices[i] = -1;
+                }
             }
+            return shuffle_vectors(a_shuffle->getOperand(0),
+                                   a_shuffle->getOperand(1), new_indices);
         }
     }
 
@@ -1516,7 +1517,11 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector<int> &indices) {
         vector<int> i8_indices(indices.size() * replicate);
         for (size_t i = 0; i < indices.size(); i++) {
             for (int j = 0; j < replicate; j++) {
-                i8_indices[i * replicate + j] = indices[i] * replicate + j;
+                if (indices[i] == -1) {
+                    i8_indices[i * replicate + j] = -1;  // Replicate the don't-care.
+                } else {
+                    i8_indices[i * replicate + j] = indices[i] * replicate + j;
+                }
             }
         }
         Value *result = vdelta(i8_lut, i8_indices);
diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp
@@ -54,16 +54,24 @@ class SimdOpCheckHVX : public SimdOpCheckTest {
             isa_version = 62;
         }
 
+        auto valign_test_u8 = [&](int off) {
+            return in_u8(x + off) + in_u8(x + off + 1);
+        };
+
+        auto valign_test_u16 = [&](int off) {
+            return in_u16(x + off) + in_u16(x + off + 1);
+        };
+
         // Verify that unaligned loads use the right instructions, and don't try to use
         // immediates of more than 3 bits.
-        check("valign(v*,v*,#7)", hvx_width / 1, in_u8(x + 7));
-        check("vlalign(v*,v*,#7)", hvx_width / 1, in_u8(x + hvx_width - 7));
-        check("valign(v*,v*,r*)", hvx_width / 1, in_u8(x + 8));
-        check("valign(v*,v*,r*)", hvx_width / 1, in_u8(x + hvx_width - 8));
-        check("valign(v*,v*,#6)", hvx_width / 1, in_u16(x + 3));
-        check("vlalign(v*,v*,#6)", hvx_width / 1, in_u16(x + hvx_width - 3));
-        check("valign(v*,v*,r*)", hvx_width / 1, in_u16(x + 4));
-        check("valign(v*,v*,r*)", hvx_width / 1, in_u16(x + hvx_width - 4));
+        check("valign(v*,v*,#7)", hvx_width / 1, valign_test_u8(6));
+        check("vlalign(v*,v*,#7)", hvx_width / 1, valign_test_u8(hvx_width - 7));
+        check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u8(8));
+        check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u8(hvx_width - 8));
+        check("valign(v*,v*,#6)", hvx_width / 1, valign_test_u16(3));
+        check("vlalign(v*,v*,#6)", hvx_width / 1, valign_test_u16(hvx_width - 3));
+        check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u16(4));
+        check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u16(hvx_width - 4));
 
         check("vunpack(v*.ub)", hvx_width / 1, u16(u8_1));
         check("vunpack(v*.ub)", hvx_width / 1, i16(u8_1));