Is this a duplicate?
Type of Bug
Silent Failure
Component
libcu++
Describe the bug
cuda::atomic_ref<unsigned __int128>::compare_exchange_strong returns true but writes garbled data to memory instead of the desired value. The root cause is here
|
mov.b128 _d, {{%0, %1}}; |
|
mov.b128 _v, {{%4, %5}}; |
the two mov.b128 instructions load from the wrong inline asm operands. The first reads from the output registers (undefined before the asm block) instead of the compare inputs, and the second mixes half of the expected value with half of the desired value. All 20 atom.cas.*.b128 variants (5 memory orders x 4 scopes) are affected.
The fix is a two-character change in the codegen template plus regenerating cuda_ptx_generated.h:
--- a/libcudacxx/codegen/generators/compare_and_swap.h
+++ b/libcudacxx/codegen/generators/compare_and_swap.h
@@ -72,8 +72,8 @@
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {{%0, %1}};
- mov.b128 _v, {{%4, %5}};
+ mov.b128 _d, {{%3, %4}};
+ mov.b128 _v, {{%5, %6}};
atom.cas{3}{5}.b128 _d,[%2],_d,_v;
mov.b128 {{%0, %1}}, _d;
How to Reproduce
Standalone reproducer (requires sm_90+):
#include <cstdio>
#include <cstdint>
#include <cuda/atomic>
using u128 = unsigned __int128;
__global__ void test_atomic_ref_cas(u128* slot, bool* cas_ok, u128* readback)
{
u128 expected = ~u128(0);
u128 desired = (u128(7) << 64) | 42;
cuda::atomic_ref<u128, cuda::thread_scope_device> ref{*slot};
*cas_ok = ref.compare_exchange_strong(expected, desired, cuda::memory_order_seq_cst);
*readback = ref.load(cuda::memory_order_seq_cst);
}
int main()
{
u128* d_slot;
bool* h_ok;
u128* h_rb;
cudaMalloc(&d_slot, sizeof(u128));
cudaMallocHost(&h_ok, sizeof(bool));
cudaMallocHost(&h_rb, sizeof(u128));
u128 sentinel = ~u128(0);
cudaMemcpy(d_slot, &sentinel, sizeof(u128), cudaMemcpyHostToDevice);
test_atomic_ref_cas<<<1, 1>>>(d_slot, h_ok, h_rb);
cudaDeviceSynchronize();
if (cudaGetLastError() != cudaSuccess) {
printf("CUDA error: %s\n", cudaGetErrorString(cudaGetLastError()));
return 1;
}
auto lo = [](u128 v) { return static_cast<uint64_t>(v); };
auto hi = [](u128 v) { return static_cast<uint64_t>(v >> 64); };
u128 expected_val = (u128(7) << 64) | 42;
printf("compare_exchange_strong returned: %s\n", *h_ok ? "true" : "false");
printf("slot after CAS: lo=0x%016lx hi=0x%016lx\n", lo(*h_rb), hi(*h_rb));
printf("expected result: lo=0x%016lx hi=0x%016lx\n", lo(expected_val), hi(expected_val));
bool pass = *h_ok && (*h_rb == expected_val);
printf("Result: %s\n", pass ? "PASS" : "FAIL");
cudaFree(d_slot);
cudaFreeHost(h_ok);
cudaFreeHost(h_rb);
return pass ? 0 : 1;
}
Compile and run:
nvcc -std=c++17 -arch=sm_90 --expt-extended-lambda -I libcudacxx/include -o repro_128b_cas repro_128b_cas.cu && ./repro_128b_cas
Output:
compare_exchange_strong returned: true
slot after CAS: lo=0xffffffffffffffff hi=0xffffffffffffffff
expected result: lo=0x000000000000002a hi=0x0000000000000007
Result: FAIL
The existing lit test (16b_integral_ref.pass.cpp) does catch this bug, but only when configured with CMAKE_CUDA_ARCHITECTURES=90 or higher. In the default CI configuration, the test compiles for multiple architectures including sub-sm_90 targets, which causes lit to add pre-sm-90 to available features. Since the test is marked UNSUPPORTED: pre-sm-90, it gets skipped entirely.
|
// UNSUPPORTED: pre-sm-90 |
ci/util/build_and_test_targets.sh --preset libcudacxx --cmake-options "-DCMAKE_CUDA_ARCHITECTURES=90" --lit-precompile-tests "std/atomics/atomics.types.generic/integral/16b_integral_ref.pass.cpp" --lit-tests "std/atomics/atomics.types.generic/integral/16b_integral_ref.pass.cpp"
...
-- Testing: 1 tests, 1 workers --
FAIL: libcu++ :: std/atomics/atomics.types.generic/integral/16b_integral_ref.pass.cpp (1 of 1)
******************** TEST 'libcu++ :: std/atomics/atomics.types.generic/integral/16b_integral_ref.pass.cpp' FAILED ********************
Compiled With: '/usr/bin/sccache /usr/local/cuda/bin/nvcc -o /home/scratch.djuenger_gpu_1/projects/cccl/build/libcudacxx/libcudacxx/test/libcudacxx/test/std/atomics/atomics.types.generic/integral/Output/16b_integral_ref.pass.cpp.o -x cu /home/scratch.djuenger_gpu_1/projects/cccl/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral/16b_integral_ref.pass.cpp -c -std=c++17 -ftemplate-depth=270 -ccbin=/usr/bin/g++ -include /home/scratch.djuenger_gpu_1/projects/cccl/libcudacxx/test/support/nasty_macros.h -I/home/scratch.djuenger_gpu_1/projects/cccl/libcudacxx/include -I/home/scratch.djuenger_gpu_1/projects/cccl/libcudacxx/../thrust/ -I/home/scratch.djuenger_gpu_1/projects/cccl/libcudacxx/../cub/ -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -Xcompiler -fno-rtti -I/home/scratch.djuenger_gpu_1/projects/cccl/libcudacxx/test/support -D_CCCL_NO_SYSTEM_HEADER -DCCCL_ENABLE_ASSERTIONS -I /home/scratch.djuenger_sw/.cache/cpm/dlpack/4e3b45e89c3a707d3546f7893637c9e7d9f33e7c/include -DCCCL_ENABLE_OPTIONAL_REF -DCCCL_IGNORE_DEPRECATED_CPP_DIALECT -DLIBCUDACXX_IGNORE_DEPRECATED_ABI -include /home/scratch.djuenger_gpu_1/projects/cccl/libcudacxx/test/libcudacxx/force_include.h --compiler-options=-Wall --compiler-options=-Wextra -Wno-deprecated-gpu-targets --extended-lambda -gencode=arch=compute_90,code=sm_90 -Xcudafe --display_error_number -Werror=all-warnings -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Werror -Xcompiler -Wno-literal-suffix -Xcompiler -Wno-unused-parameter -Xcompiler -Wno-unused-local-typedefs -Xcompiler -Wno-deprecated-declarations -Xcompiler -Wno-noexcept-type -Xcompiler -Wno-unused-function -D_LIBCUDACXX_DISABLE_PRAGMA_GCC_SYSTEM_HEADER -c && /usr/local/cuda/bin/nvcc -o /home/scratch.djuenger_gpu_1/projects/cccl/build/libcudacxx/libcudacxx/test/libcudacxx/test/std/atomics/atomics.types.generic/integral/Output/16b_integral_ref.pass.cpp.exe /home/scratch.djuenger_gpu_1/projects/cccl/build/libcudacxx/libcudacxx/test/libcudacxx/test/std/atomics/atomics.types.generic/integral/Output/16b_integral_ref.pass.cpp.o -std=c++17 -ftemplate-depth=270 -ccbin=/usr/bin/g++ -latomic -lm -lgcc_s -lgcc -lpthread -lrt -lc -lgcc_s -lgcc'
Command: "/home/scratch.djuenger_gpu_1/projects/cccl/build/libcudacxx/libcudacxx/test/libcudacxx/test/std/atomics/atomics.types.generic/integral/Output/16b_integral_ref.pass.cpp.exe"
Exit Code: 1
Standard Output:
--
Testing on host:
Testing on device:
CUDA devices found: 1
Device 0: "NVIDIA H100 80GB HBM3", Selected, SM90, 85019590656 [bytes]
CUDA ERROR, line 114: cudaErrorAssert: device-side assert triggered
--
Standard Error:
--
/home/scratch.djuenger_gpu_1/projects/cccl/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral/16b_integral_ref.pass.cpp:126: void do_test() [with A = cuda::std::__4::atomic_ref<__int128>; T = __int128; Selector = shared_memory_selector]: block: [0,0,0], thread: [0,0,0] Assertion `obj == T(2)` failed.
--
Compiled test failed unexpectedly!
********************
********************
Failed Tests (1):
libcu++ :: std/atomics/atomics.types.generic/integral/16b_integral_ref.pass.cpp
Testing Time: 0.48s
Total Discovered Tests: 1
Failed: 1 (100.00%)
5 warning(s) in tests
::endgroup::
🔴🧪 LIT test failed (1m21s): std/atomics/atomics.types.generic/integral/16b_integral_ref.pass.cpp
Expected behavior
Abovementioned reproducer should show the following output:
compare_exchange_strong returned: true
slot after CAS: lo=0x000000000000002a hi=0x0000000000000007
expected result: lo=0x000000000000002a hi=0x0000000000000007
Result: PASS
Reproduction link
No response
Operating System
Ubuntu 24.04
nvidia-smi output
nvidia-smi
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 595.58.03 Driver Version: 595.58.03 CUDA Version: 13.2 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA H100 80GB HBM3 On | 00000000:6B:00.0 Off | 0 |
| N/A 42C P0 66W / 700W | 4MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
NVCC version
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Tue_Dec_16_07:23:41_PM_PST_2025
Cuda compilation tools, release 13.1, V13.1.115
Build cuda_13.1.r13.1/compiler.37061995_0
Is this a duplicate?
Type of Bug
Silent Failure
Component
libcu++
Describe the bug
cuda::atomic_ref<unsigned __int128>::compare_exchange_strongreturnstruebut writes garbled data to memory instead of the desired value. The root cause is herecccl/libcudacxx/codegen/generators/compare_and_swap.h
Lines 75 to 76 in 2d263e1
the two
mov.b128instructions load from the wrong inline asm operands. The first reads from the output registers (undefined before the asm block) instead of the compare inputs, and the second mixes half of the expected value with half of the desired value. All 20atom.cas.*.b128variants (5 memory orders x 4 scopes) are affected.The fix is a two-character change in the codegen template plus regenerating
cuda_ptx_generated.h:How to Reproduce
Standalone reproducer (requires
sm_90+):Compile and run:
nvcc -std=c++17 -arch=sm_90 --expt-extended-lambda -I libcudacxx/include -o repro_128b_cas repro_128b_cas.cu && ./repro_128b_casOutput:
compare_exchange_strong returned: true slot after CAS: lo=0xffffffffffffffff hi=0xffffffffffffffff expected result: lo=0x000000000000002a hi=0x0000000000000007 Result: FAILThe existing
littest (16b_integral_ref.pass.cpp) does catch this bug, but only when configured withCMAKE_CUDA_ARCHITECTURES=90or higher. In the default CI configuration, the test compiles for multiple architectures including sub-sm_90 targets, which causeslitto addpre-sm-90to available features. Since the test is markedUNSUPPORTED: pre-sm-90, it gets skipped entirely.cccl/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral/16b_integral_ref.pass.cpp
Line 9 in 2d263e1
Expected behavior
Abovementioned reproducer should show the following output:
compare_exchange_strong returned: true slot after CAS: lo=0x000000000000002a hi=0x0000000000000007 expected result: lo=0x000000000000002a hi=0x0000000000000007 Result: PASSReproduction link
No response
Operating System
Ubuntu 24.04
nvidia-smi output
NVCC version