area-CodeGen-coreclr, help wanted, tenet-performance
Description
(Assuming x is unsigned...)
A fairly common pattern for clearing the N least-significant bits of a value is (x >> N) << N. However, CoreCLR's JIT emits this exactly as written (a right shift followed by a left shift) instead of the slightly more performant single mask: x & ~((1u << N) - 1).
Assembly
https://godbolt.org/z/5zsWaes4q
/// <summary>
/// Three spellings of "clear the low five bits of <c>x</c>" (round down to a
/// multiple of 32), kept side by side so the code generated for each can be compared.
/// </summary>
internal static class Klass
{
    /// <summary>Shift form: shifts <paramref name="x"/> right by 5, then left by 5.</summary>
    /// <param name="x">Value whose low five bits are cleared.</param>
    /// <returns><paramref name="x"/> with bits 0-4 set to zero.</returns>
    public static uint Test(uint x) => (x >> 5) << 5;

    /// <summary>Divide/multiply form: divides <paramref name="x"/> by 32, then multiplies by 32.</summary>
    /// <param name="x">Value whose low five bits are cleared.</param>
    /// <returns><paramref name="x"/> with bits 0-4 set to zero.</returns>
    public static uint Test2(uint x) => (x / 32) * 32;

    /// <summary>Mask form: ANDs <paramref name="x"/> with <c>~31u</c>.</summary>
    /// <param name="x">Value whose low five bits are cleared.</param>
    /// <returns><paramref name="x"/> with bits 0-4 set to zero.</returns>
    public static uint Test3(uint x) => x & ~31u;
}
; Shift form: the argument is copied from ecx into eax, then shifted right by 5
; and left by 5 -- two shift instructions, exactly as the C# was written.
Klass:Test(uint):uint (FullOpts):
mov eax, ecx
shr eax, 5
shl eax, 5
ret
; Divide/multiply form: the listing is the same shr/shl pair as Test above.
Klass:Test2(uint):uint (FullOpts):
mov eax, ecx
shr eax, 5
shl eax, 5
ret
; Mask form: a single `and`; the immediate -32 is the 32-bit two's-complement
; bit pattern 0xFFFFFFE0, i.e. ~31u.
Klass:Test3(uint):uint (FullOpts):
mov eax, ecx
and eax, -32
ret
Benchmark
/// <summary>
/// Micro-benchmarks comparing two ways of clearing the low five bits of each
/// element in a fixed array of pseudo-random <see cref="uint"/> values:
/// shift-right/shift-left versus a single AND mask.
/// </summary>
public class Benchmarks
{
    /// <summary>Shared input data, generated once from a fixed seed.</summary>
    private static readonly uint[] Tests;

    static Benchmarks()
    {
        // Fixed seed so every run (and both benchmarks) sees the same inputs.
        Random r = new(42);
        uint[] tests = new uint[100];

        // Fill the array element by element. Note that Random.GetItems(choices, length)
        // samples from `choices` into a NEW array and never writes to `choices`,
        // so it cannot be used to populate `tests` in place.
        for (int i = 0; i < tests.Length; i++)
        {
            // NextInt64's upper bound is exclusive; +1 makes uint.MaxValue reachable.
            tests[i] = (uint)r.NextInt64(uint.MaxValue + 1L);
        }

        Tests = tests;
    }

    /// <summary>Sums <c>(v &gt;&gt; 5) &lt;&lt; 5</c> over every input value.</summary>
    /// <returns>The wrapping 32-bit sum of the masked values.</returns>
    [Benchmark]
    public uint TestShift()
    {
        // Copy the static field to a local before the loop.
        uint[] tests = Tests;
        uint accum = 0;
        for (int i = 0; i < tests.Length; i++)
        {
            accum += (tests[i] >> 5) << 5;
        }

        return accum;
    }

    /// <summary>Sums <c>v &amp; ~31u</c> over every input value.</summary>
    /// <returns>The wrapping 32-bit sum of the masked values.</returns>
    [Benchmark]
    public uint TestMask()
    {
        // Copy the static field to a local before the loop.
        uint[] tests = Tests;
        uint accum = 0;
        for (int i = 0; i < tests.Length; i++)
        {
            accum += tests[i] & ~31u;
        }

        return accum;
    }
}
BenchmarkDotNet v0.15.2, Windows 11 (10.0.26100.4770/24H2/2024Update/HudsonValley)
Intel Core i9-10900X CPU 3.70GHz, 1 CPU, 20 logical and 10 physical cores
.NET SDK 9.0.304
[Host] : .NET 8.0.19 (8.0.1925.36514), X64 RyuJIT AVX-512F+CD+BW+DQ+VL
DefaultJob : .NET 8.0.19 (8.0.1925.36514), X64 RyuJIT AVX-512F+CD+BW+DQ+VL
| Method | Mean | Error | StdDev | Code Size | Allocated |
|---------- |---------:|---------:|---------:|----------:|----------:|
| TestShift | 51.54 ns | 0.456 ns | 0.426 ns | 61 B | - |
| TestMask | 42.26 ns | 0.344 ns | 0.322 ns | 57 B | - |