Skip to content

Commit 1d5c1e2

Browse files
authored
Move ARM Linux workflows to GitHub Actions (#8977)
* ci: add ARM Linux GitHub Actions workflow
* python: pin arm32 wheels from piwheels in CI environment
* arm32: fix toolchain/runtime compatibility and builtins mapping
* codegen: fix ARM SVE2 lowering and bool vector handling
* tests: update ARM/SVE2 coverage and LLVM-version-specific skips
* tests: run autoschedulers_cpu serially
1 parent 560e7c1 commit 1d5c1e2

36 files changed

Lines changed: 808 additions & 81 deletions
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
# Builds and tests Halide on ARM Linux runners, for both 64-bit and 32-bit
# userlands, against each LLVM dependency group listed in the matrix.
name: ARM Linux

on:
  pull_request:
    types: [ opened, synchronize, reopened ]
  workflow_dispatch:

# One in-flight run per pull request (or per ref for manual dispatches);
# a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  arm-linux:
    name: arm-${{ matrix.bits }} / ${{ matrix.uv_group }}
    runs-on: ubuntu-24.04-arm
    strategy:
      fail-fast: false
      matrix:
        bits: [ "64", "32" ]
        uv_group: [ "ci-llvm-main", "ci-llvm-22", "ci-llvm-21", "ci-llvm-20" ]
        # The `bits` values below are quoted so they have the same scalar type
        # as the `bits` axis above and attach `arch`/`python` to the existing
        # combinations without relying on implicit number/string coercion.
        include:
          - bits: "32"
            arch: armv7l
            python: 3.11-armv7-gnueabihf # needed for piwheels
          - bits: "64"
            arch: aarch64
            python: linux-aarch64-gnu

    steps:
      - uses: actions/checkout@v4

      - uses: astral-sh/setup-uv@v5

      - name: Install system dependencies
        run: |
          if [[ "${{ matrix.bits }}" == "32" ]]; then
            sudo dpkg --add-architecture armhf
          fi

          # Retry `apt-get update` up to three times, pausing only when
          # another attempt is actually going to be made.
          apt_update() {
            for i in 1 2 3; do
              if sudo apt-get update; then return 0; fi
              if [[ "$i" -lt 3 ]]; then
                echo "apt-get update failed (attempt $i/3), retrying in 10s..."
                sleep 10
              fi
            done
            echo "apt-get update failed after 3 attempts"
            return 1
          }
          apt_update

          if [[ "${{ matrix.bits }}" == "32" ]]; then
            sudo apt-get install -y \
              binutils-arm-linux-gnueabihf \
              g++-arm-linux-gnueabihf \
              gcc-arm-linux-gnueabihf \
              libc6:armhf \
              libstdc++6:armhf \
              libatomic1:armhf \
              libpng-dev:armhf \
              libjpeg-dev:armhf
          else
            sudo apt-get install -y \
              libpng-dev \
              libjpeg-dev
          fi

      # The inner script is double-quoted, so ${GITHUB_WORKSPACE}, $GITHUB_PATH
      # and $GITHUB_ENV are expanded by the outer shell before `bash -ec` runs.
      # NOTE(review): `setarch` presumably makes uv see the 32-bit machine type
      # on the armv7l rows - confirm.
      - name: Sync CI environment
        run: |
          setarch ${{ matrix.arch }} bash -ec "
            uv sync --python '${{ matrix.python }}' --group '${{ matrix.uv_group }}' --no-install-project
            echo '${GITHUB_WORKSPACE}/.venv/bin' >> '$GITHUB_PATH'
            echo 'VIRTUAL_ENV=${GITHUB_WORKSPACE}/.venv' >> '$GITHUB_ENV'
          "

      - name: Configure LLVM
        run: echo "Halide_LLVM_ROOT=$(halide-llvm --prefix)" >> "$GITHUB_ENV"

      - name: Configure CMake
        run: |
          # Only the 32-bit rows need the arm32 toolchain file.
          TOOLCHAIN_ARGS=()
          if [[ "${{ matrix.bits }}" == "32" ]]; then
            TOOLCHAIN_ARGS+=("-DCMAKE_TOOLCHAIN_FILE=${GITHUB_WORKSPACE}/cmake/toolchain.linux-arm32.cmake")
          fi

          cmake -G Ninja -S . -B build \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DHalide_LLVM_ROOT="${Halide_LLVM_ROOT}" \
            -DWITH_PYTHON_BINDINGS=OFF \
            "${TOOLCHAIN_ARGS[@]}"

      - name: Initial build
        run: cmake --build build

      # Exports HAS_SVE2=true/false depending on whether the detected host
      # target string contains "sve2".
      - name: Detect host target
        run: |
          HOST_TARGET=$(./build/src/autoschedulers/common/get_host_target)
          echo "HAS_SVE2=$([[ "$HOST_TARGET" == *sve2* ]] && echo true || echo false)" >> "$GITHUB_ENV"
          echo "Detected host target: ${HOST_TARGET}"

      # Runs on every 32-bit row, and on 64-bit rows only when the host
      # reports SVE2.
      - name: Test (host)
        if: matrix.bits == '32' || env.HAS_SVE2 == 'true'
        run: |
          cmake -S . -B build -DHalide_TARGET=host
          cmake --build build
          ctest --test-dir build --build-config RelWithDebInfo --output-on-failure -j "$(nproc)"

      - name: Test (NEON)
        if: matrix.bits == '64'
        run: |
          cmake -S . -B build -DHalide_TARGET=arm-64-linux-arm_dot_prod-arm_fp16
          cmake --build build
          ctest --test-dir build --build-config RelWithDebInfo --output-on-failure -j "$(nproc)"

      # "cmake" is a meta-target; the test helpers substitute
      # Halide_CMAKE_TARGET for it when setting HL_TARGET / HL_JIT_TARGET.
      - name: Test (no extensions)
        run: |
          cmake -S . -B build -DHalide_TARGET=cmake
          cmake --build build
          ctest --test-dir build --build-config RelWithDebInfo --output-on-failure -j "$(nproc)"

cmake/HalideTestHelpers.cmake

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,20 @@ function(add_halide_test TARGET)
6969
#
7070
# target_link_libraries("${TARGET}" PRIVATE Halide::TerminateHandler)
7171

72-
set_tests_properties(${TARGET} PROPERTIES
73-
LABELS "${args_GROUPS}"
74-
ENVIRONMENT "HL_TARGET=${Halide_TARGET};HL_JIT_TARGET=${Halide_TARGET}"
75-
SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
76-
WILL_FAIL ${args_EXPECT_FAILURE})
72+
# Resolve the "cmake" meta-target
73+
string(REGEX REPLACE "^cmake" "${Halide_CMAKE_TARGET}" _resolved_target "${Halide_TARGET}")
74+
75+
set_tests_properties(
76+
${TARGET}
77+
PROPERTIES
78+
LABELS "${args_GROUPS}"
79+
ENVIRONMENT "HL_TARGET=${_resolved_target};HL_JIT_TARGET=${_resolved_target}"
80+
SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
81+
WILL_FAIL ${args_EXPECT_FAILURE}
82+
)
83+
if ("autoschedulers_cpu" IN_LIST args_GROUPS)
84+
set_tests_properties(${TARGET} PROPERTIES RUN_SERIAL TRUE)
85+
endif ()
7786

7887
if (NOT args_USE_EXIT_CODE_ONLY)
7988
set_tests_properties(${TARGET} PROPERTIES

cmake/toolchain.linux-arm32.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
2424
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
2525
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
2626

27+
set(CMAKE_C_FLAGS_INIT "-mfp16-format=ieee -Wno-psabi")
28+
set(CMAKE_CXX_FLAGS_INIT "-mfp16-format=ieee -Wno-psabi")
29+
2730
# add_custom_command() will make bad decisions about running the command
2831
# when crosscompiling (it won't expand the target into a full path).
2932
# Setting CMAKE_CROSSCOMPILING_EMULATOR to /usr/bin/env tricks it into

pyproject.toml

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ readme = "./packaging/pip/README.md"
1616
requires-python = ">=3.10"
1717
dependencies = [
1818
"imageio>=2",
19+
"pillow; platform_machine == 'armv8l' or platform_machine == 'armv7l'",
1920
"numpy>=1.26",
2021
]
2122
dynamic = ['version']
@@ -38,7 +39,6 @@ classifiers = [
3839
"Environment :: WebAssembly",
3940
"Intended Audience :: Developers",
4041
"Intended Audience :: Science/Research",
41-
"License :: OSI Approved :: MIT License",
4242
"Natural Language :: English",
4343
"Operating System :: MacOS",
4444
"Operating System :: Microsoft :: Windows",
@@ -68,7 +68,8 @@ dev = [
6868
"setuptools-scm>=8.3.1",
6969
]
7070
apps = [
71-
"onnx==1.18.0", # for apps/onnx
71+
"onnx==1.18.0; platform_machine != 'armv8l' and platform_machine != 'armv7l'", # for apps/onnx
72+
"onnx==1.17.0; platform_machine == 'armv8l' or platform_machine == 'armv7l'", # for apps/onnx
7273
"pytest", # unspecified onnx dependency
7374
]
7475
tools = [
@@ -202,8 +203,18 @@ conflicts = [
202203

203204
[tool.uv.sources]
204205
halide-llvm = { index = "halide" }
206+
imageio = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
207+
numpy = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
208+
onnx = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
209+
pillow = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
210+
protobuf = { index = "piwheels", marker = "platform_machine == 'armv8l' or platform_machine == 'armv7l'" }
205211

206212
[[tool.uv.index]]
207213
name = "halide"
208214
url = "https://pypi.halide-lang.org/simple"
209215
explicit = true
216+
217+
[[tool.uv.index]]
218+
name = "piwheels"
219+
url = "https://piwheels.org/simple"
220+
explicit = true

src/CodeGen_ARM.cpp

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ Target complete_arm_target(Target t) {
5656
}
5757
};
5858

59+
// ARMFp16 implies ARMv8.2-A; we don't know of any devices where
60+
// that doesn't hold. The cascade loop below will set ARMv81a and ARMv8a.
61+
add_implied_feature_if_supported(t, Target::ARMFp16, Target::ARMv82a);
62+
5963
constexpr int num_arm_v8_features = 10;
6064
static const Target::Feature arm_v8_features[num_arm_v8_features] = {
6165
Target::ARMv89a,
@@ -1681,6 +1685,7 @@ void CodeGen_ARM::visit(const Store *op) {
16811685
vpred_val = convert_fixed_or_scalable_vector_type(vpred_val, pred_type);
16821686
if (is_predicated_store) {
16831687
Value *sliced_store_vpred_val = slice_vector(store_pred_val, i, natural_lanes);
1688+
sliced_store_vpred_val = convert_fixed_or_scalable_vector_type(sliced_store_vpred_val, pred_type);
16841689
vpred_val = builder->CreateAnd(vpred_val, sliced_store_vpred_val);
16851690
}
16861691

@@ -1854,6 +1859,7 @@ void CodeGen_ARM::visit(const Load *op) {
18541859
Value *vpred_val = codegen(vpred);
18551860
if (is_predicated_load) {
18561861
Value *sliced_load_vpred_val = slice_vector(load_pred_val, i, natural_lanes);
1862+
sliced_load_vpred_val = convert_fixed_or_scalable_vector_type(sliced_load_vpred_val, vpred_val->getType());
18571863
vpred_val = builder->CreateAnd(vpred_val, sliced_load_vpred_val);
18581864
}
18591865

@@ -1904,8 +1910,14 @@ Value *CodeGen_ARM::interleave_vectors(const std::vector<Value *> &vecs) {
19041910
return CodeGen_Posix::interleave_vectors(vecs);
19051911
}
19061912

1907-
// Lower into llvm.vector.interleave intrinsic
1913+
// Lower into llvm.vector.interleave intrinsic.
1914+
// LLVM only supports non-power-of-2 strides (e.g. 3) for scalable
1915+
// vectors starting in LLVM 22.
1916+
#if LLVM_VERSION >= 220
19081917
const std::set<int> supported_strides{2, 3, 4, 8};
1918+
#else
1919+
const std::set<int> supported_strides{2, 4, 8};
1920+
#endif
19091921
const int stride = vecs.size();
19101922
const int src_lanes = get_vector_num_elements(vecs[0]->getType());
19111923

@@ -1957,7 +1969,11 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &
19571969
}
19581970

19591971
// Lower slice with stride into llvm.vector.deinterleave intrinsic
1972+
#if LLVM_VERSION >= 220
19601973
const std::set<int> supported_strides{2, 3, 4, 8};
1974+
#else
1975+
const std::set<int> supported_strides{2, 4, 8};
1976+
#endif
19611977
if (supported_strides.find(slice_stride) != supported_strides.end() &&
19621978
dst_lanes * slice_stride == src_lanes &&
19631979
indices.front() < slice_stride && // Start position cannot be larger than stride
@@ -2410,6 +2426,10 @@ string CodeGen_ARM::mcpu_target() const {
24102426
if (target.bits == 32) {
24112427
if (target.has_feature(Target::ARMv7s)) {
24122428
return "swift";
2429+
} else if (target.has_feature(Target::ARMv82a)) {
2430+
return "cortex-a55";
2431+
} else if (target.has_feature(Target::ARMv8a)) {
2432+
return "cortex-a32";
24132433
} else {
24142434
return "cortex-a9";
24152435
}
@@ -2436,7 +2456,10 @@ string CodeGen_ARM::mattrs() const {
24362456
attrs.emplace_back("+fullfp16");
24372457
}
24382458
if (target.has_feature(Target::ARMv8a)) {
2439-
attrs.emplace_back("+v8a");
2459+
// The ARM (32-bit) backend calls this feature "v8"; the AArch64
2460+
// backend calls it "v8a". The dotted sub-versions (v8.1a, v8.2a,
2461+
// etc.) use the same names in both backends.
2462+
attrs.emplace_back(target.bits == 32 ? "+v8" : "+v8a");
24402463
}
24412464
if (target.has_feature(Target::ARMv81a)) {
24422465
attrs.emplace_back("+v8.1a");

src/CodeGen_LLVM.cpp

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,6 +1515,17 @@ void CodeGen_LLVM::visit(const Reinterpret *op) {
15151515
llvm::Type *llvm_dst_fixed = get_vector_type(llvm_type_of(dst.element_of()), dst.lanes(), VectorTypeConstraint::Fixed);
15161516
value = builder->CreateBitOrPointerCast(value, llvm_dst_fixed);
15171517
value = fixed_to_scalable_vector_type(value);
1518+
} else if (isa<FixedVectorType>(value->getType()) && isa<ScalableVectorType>(llvm_dst)) {
1519+
// Cannot bitcast/ptrtoint directly between fixed and scalable vectors.
1520+
// First cast to a fixed vector of the destination element type, then convert to scalable.
1521+
llvm::Type *llvm_dst_fixed = get_vector_type(llvm_dst->getScalarType(), dst.lanes(), VectorTypeConstraint::Fixed);
1522+
value = builder->CreateBitOrPointerCast(value, llvm_dst_fixed);
1523+
value = fixed_to_scalable_vector_type(value);
1524+
} else if (isa<ScalableVectorType>(value->getType()) && isa<FixedVectorType>(llvm_dst)) {
1525+
// Cannot bitcast/ptrtoint directly between scalable and fixed vectors.
1526+
// First convert to a fixed vector of the source element type, then cast.
1527+
value = scalable_to_fixed_vector_type(value);
1528+
value = builder->CreateBitOrPointerCast(value, llvm_dst);
15181529
} else {
15191530
// Our `Reinterpret` expr directly maps to LLVM IR bitcast/ptrtoint/inttoptr
15201531
// instructions with no additional handling required:
@@ -4338,10 +4349,12 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini
43384349
const int input_lanes = val.type().lanes();
43394350
const int input_bytes = input_lanes * val.type().bytes();
43404351
const int vscale = std::max(effective_vscale, 1);
4352+
// LLVM added VECREDUCE_MUL/FMUL lowering for SVE in LLVM 22.
4353+
const bool mul_ok = LLVM_VERSION >= 220 || effective_vscale == 0;
43414354
const bool llvm_has_intrinsic =
43424355
// Must be one of these ops
43434356
((op->op == VectorReduce::Add ||
4344-
op->op == VectorReduce::Mul ||
4357+
(op->op == VectorReduce::Mul && mul_ok) ||
43454358
op->op == VectorReduce::Min ||
43464359
op->op == VectorReduce::Max) &&
43474360
(use_llvm_vp_intrinsics ||
@@ -4950,6 +4963,13 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) {
49504963
// otherwise.
49514964
llvm::Type *scalar_type = vec->getType()->getScalarType();
49524965

4966+
if (scalar_type->isIntegerTy(1)) {
4967+
auto *result_type = cast<VectorType>(get_vector_type(scalar_type, size / effective_vscale, VectorTypeConstraint::VScale));
4968+
return handle_bool_as_i8(vec, result_type, [&](Value *v) {
4969+
return slice_vector(v, start, size);
4970+
});
4971+
}
4972+
49534973
int intermediate_lanes = std::min(size, vec_lanes - start);
49544974
llvm::Type *intermediate_type = get_vector_type(scalar_type, intermediate_lanes, VectorTypeConstraint::Fixed);
49554975

@@ -5241,6 +5261,18 @@ llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, llvm::
52415261
return match_vector_type_scalable(value, guide->getType());
52425262
}
52435263

5264+
llvm::Value *CodeGen_LLVM::handle_bool_as_i8(llvm::Value *arg, llvm::VectorType *result_i1_type,
5265+
const std::function<llvm::Value *(llvm::Value *)> &fn) {
5266+
auto *arg_vty = cast<llvm::VectorType>(arg->getType());
5267+
bool scalable = isa<llvm::ScalableVectorType>(arg_vty);
5268+
int min_elts = scalable ? cast<llvm::ScalableVectorType>(arg_vty)->getMinNumElements() : cast<llvm::FixedVectorType>(arg_vty)->getNumElements();
5269+
auto constraint = scalable ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed;
5270+
llvm::Type *arg_i8 = get_vector_type(i8_t, min_elts, constraint);
5271+
llvm::Value *widened = builder->CreateZExt(arg, arg_i8);
5272+
llvm::Value *result = fn(widened);
5273+
return builder->CreateTrunc(result, result_i1_type);
5274+
}
5275+
52445276
llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *arg,
52455277
llvm::Type *desired_type) {
52465278
llvm::Type *arg_type = arg->getType();
@@ -5250,6 +5282,18 @@ llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *ar
52505282
}
52515283

52525284
internal_assert(arg_type->getScalarType() == desired_type->getScalarType());
5285+
5286+
if (arg_type->isVectorTy() && desired_type->isVectorTy() &&
5287+
arg_type->getScalarType()->isIntegerTy(1)) {
5288+
bool dst_scalable = isa<llvm::ScalableVectorType>(desired_type);
5289+
int dst_elts = get_vector_num_elements(desired_type);
5290+
llvm::Type *dst_i8 = get_vector_type(i8_t, dst_scalable ? dst_elts / effective_vscale : dst_elts,
5291+
dst_scalable ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed);
5292+
return handle_bool_as_i8(arg, cast<VectorType>(desired_type), [&](Value *v) {
5293+
return convert_fixed_or_scalable_vector_type(v, dst_i8);
5294+
});
5295+
}
5296+
52535297
if (!arg_type->isVectorTy()) {
52545298
arg = create_broadcast(arg, 1);
52555299
arg_type = arg->getType();
@@ -5331,6 +5375,12 @@ llvm::Value *CodeGen_LLVM::fixed_to_scalable_vector_type(llvm::Value *fixed_arg)
53315375
internal_assert(fixed_type->getElementType() == scalable_type->getElementType());
53325376
internal_assert(lanes == (scalable_type->getMinNumElements() * effective_vscale));
53335377

5378+
if (fixed_type->getElementType()->isIntegerTy(1)) {
5379+
return handle_bool_as_i8(fixed_arg, scalable_type, [&](Value *v) {
5380+
return fixed_to_scalable_vector_type(v);
5381+
});
5382+
}
5383+
53345384
// E.g. <vscale x 2 x i64> llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
53355385
const char *type_designator;
53365386
if (fixed_type->getElementType()->isIntegerTy()) {
@@ -5348,7 +5398,7 @@ llvm::Value *CodeGen_LLVM::fixed_to_scalable_vector_type(llvm::Value *fixed_arg)
53485398

53495399
std::vector<llvm::Value *> args;
53505400
args.push_back(result_vec);
5351-
args.push_back(value);
5401+
args.push_back(fixed_arg);
53525402
args.push_back(ConstantInt::get(i64_t, 0));
53535403

53545404
return simple_call_intrin(intrin, args, scalable_type);
@@ -5367,6 +5417,12 @@ llvm::Value *CodeGen_LLVM::scalable_to_fixed_vector_type(llvm::Value *scalable_a
53675417
internal_assert(fixed_type->getElementType() == scalable_type->getElementType());
53685418
internal_assert(fixed_type->getNumElements() == (scalable_type->getMinNumElements() * effective_vscale));
53695419

5420+
if (scalable_type->getElementType()->isIntegerTy(1)) {
5421+
return handle_bool_as_i8(scalable_arg, fixed_type, [&](Value *v) {
5422+
return scalable_to_fixed_vector_type(v);
5423+
});
5424+
}
5425+
53705426
// E.g. <64 x i8> @llvm.vector.extract.v64i8.nxv8i8(<vscale x 8 x i8> %vresult, i64 0)
53715427
const char *type_designator;
53725428
if (scalable_type->getElementType()->isIntegerTy()) {

0 commit comments

Comments
 (0)