llvm/llvm-project
View on GitHubImproving performance of 2xi64 icmp slt/sgt/ult/ugt on SSE2 using 2xi64 sub
Open
#174214 opened on Jan 2, 2026
backend:X86good first issuemissed-optimization
Description
Here are links to LLVM IR snippets demonstrating the two alternatives for 2xi64 icmp sgt on SSE2:
Here is a more efficient implementation (at least according to llvm-mca) of 2xi64 icmp slt on SSE2:
SSE2_2xI64_CompareGt_2: # @SSE2_2xI64_CompareGt_2
movdqa xmm2, xmm0
pcmpgtd xmm2, xmm1
# xmm2 == {(int32_t)a[0] > (int32_t)b[0],
# (int32_t)(a[0] >> 32) > (int32_t)(b[0] >> 32),
# (int32_t)a[1] > (int32_t)b[1],
# (int32_t)(a[1] >> 32) > (int32_t)(b[1] >> 32)}
# (as 4xi32 vector)
movdqa xmm3, xmm0
pcmpeqd xmm3, xmm1
# xmm3 == {(int32_t)a[0] == (int32_t)b[0],
# (int32_t)(a[0] >> 32) == (int32_t)(b[0] >> 32),
# (int32_t)a[1] == (int32_t)b[1],
# (int32_t)(a[1] >> 32) == (int32_t)(b[1] >> 32)}
# (as 4xi32 vector)
psubq xmm1, xmm0
# xmm2 == {b[0] - a[0], b[1] - a[1]} (as 2xi64 vector)
# If (a[0] >> 32) == (b[0] >> 32) && a[0] <= b[0] is true, then
# (xmm1[0] >> 32) == 0 will be true (if a, b, and xmm1 are treated as
# 2xi64 vectors).
# If (a[0] >> 32) == (b[0] >> 32) && a[0] > b[0] is true, then
# (xmm1[0] >> 32) == -1 will be true (if a, b, and xmm1 are treated as
# 2xi64 vectors).
pand xmm1, xmm3
# If xmm1, a, and b are treated as 2xi64 vectors:
# (xmm1[0] >> 32) == (((a[0] >> 32) == (b[0] >> 32) && a[0] > b[0]) ?
# -1 : 0)
# (xmm1[1] >> 32) == (((a[1] >> 32) == (b[1] >> 32) && a[1] > b[1]) ?
# -1 : 0)
# (xmm2[0] >> 32) == (((a[0] >> 32) > (b[0] >> 32)) ? -1 : 0)
# (xmm2[1] >> 32) == (((a[1] >> 32) > (b[1] >> 32)) ? -1 : 0)
por xmm1, xmm2
# If xmm1, a, and b are treated as 2xi64 vectors:
# (xmm1[0] >> 32) == ((a[0] > b[0]) ? -1 : 0)
# (xmm1[1] >> 32) == ((a[1] > b[1]) ? -1 : 0)
pshufd xmm0, xmm1, 245 # xmm0 = xmm1[1,1,3,3]
# xmm0 == { a[0] > b[0] ? -1 : 0, a[1] > b[1] ? -1 : 0 }
# (as 2xi64 vector)
ret
If (a >> 32) == (b >> 32) is true, then -2**32 + 1 <= b - a <= 2**32 - 1 will be true, making (b - a) >> 32 equal to either -1 or 0 if (a >> 32) is equal to (b >> 32) (with right shifts being arithmetic right shifts and a and b being 64-bit signed integers).