[BUG] Benchmark random data generator produces skewed distributions and invalid floating-point values
#18315, created on March 18, 2025
Description
Describe the bug
While working on #18290, I discovered that libcudf benchmarks fail to reproduce the expected performance improvements for very low-cardinality groupby cases observed by Devtech. Upon further investigation, it became evident that the random data generator used in benchmarks is not functioning correctly.
To verify this, I ran a small test by generating a 100-element int32_t column while varying the cardinality from 1 to 32. The output from cudf::test::print revealed a clear issue:
[cardinality=1]
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
[cardinality=2]
0,0,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,0,0,8,8,8,8,8,0,0,0,0,0,0,0,8,8,8,8,8,8,0,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
[cardinality=4]
0,0,60,60,60,60,60,90,90,90,90,90,90,90,90,90,90,90,90,90,90,90,90,0,0,60,60,60,60,60,8,8,8,8,8,8,8,60,60,60,60,60,60,0,60,60,60,60,60,60,60,60,60,60,90,90,90,90,90,90,90,60,60,60,60,60,60,60,60,60,60,90,90,90,90,90,90,90,90,90,90,90,90,90,90,90,90,60,60,60,60,60,60,90,90,90,90,90,90,90
[cardinality=8]
0,0,97,97,97,97,97,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,8,8,97,97,97,97,97,90,90,90,90,60,60,60,19,19,19,19,19,19,0,97,97,97,97,97,97,97,97,97,97,52,52,52,52,52,52,52,97,97,97,97,97,97,97,97,97,97,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,19,19,19,19,19,19,40,40,40,40,40,40,40
[cardinality=16]
0,8,75,75,75,75,75,59,59,59,59,59,59,59,59,51,51,51,51,51,51,51,51,90,90,26,26,26,26,26,52,52,52,52,97,97,97,56,56,56,56,56,56,8,26,26,26,26,26,75,75,75,75,75,58,58,58,58,58,58,58,75,75,75,75,75,26,26,26,26,26,59,59,59,59,59,59,59,59,51,51,51,51,51,51,51,51,56,56,56,56,56,56,51,51,51,51,51,51,51
[cardinality=32]
0,60,97,97,97,97,97,91,91,91,91,91,91,91,91,94,94,94,94,94,94,94,94,52,52,88,88,88,88,88,58,58,58,58,26,26,26,65,65,65,65,65,65,60,100,100,100,100,100,73,73,73,73,73,97,97,97,97,97,97,97,73,73,73,73,73,88,88,88,88,88,91,91,91,91,91,91,91,91,41,41,41,41,41,41,41,41,65,65,65,65,65,65,94,94,94,94,94,94,94
- When cardinality = 1, the results are as expected, with all values being the same.
- However, when cardinality = 2, the distribution is highly unbalanced: in the 100-element output above, 0 appears 12 times while 8 appears 88 times, instead of being evenly split (~50 each).
- This pattern of skewed distributions persists across other cardinalities, as demonstrated in the test results.
Such an imbalance significantly impacts benchmarking accuracy. Ideally, a uniform distribution would ensure that values are evenly represented, but the flawed data generator introduces biases that distort performance measurements. This affects almost all of our cardinality benchmarks.
Another issue lies with the floating-point generator. When generating 100 double elements within the range [0, 100], all values ended up being very close to either the max or the min value of double. This clearly indicates a serious problem with the generator, as the output is nowhere near the expected range.
-8.9884656743115785e+307,-1.900795218050189e+307,5.8237058299790331e+306,5.8237058299790331e+306,
5.8237058299790331e+306,5.8237058299790331e+306,5.8237058299790331e+306,-1.9738458611542523e+307,
-1.9738458611542523e+307,-1.9738458611542523e+307,-1.9738458611542523e+307,-1.9738458611542523e+307,
-1.9738458611542523e+307,-1.9738458611542523e+307,-1.9738458611542523e+307,-7.8726545643607571e+306,
-7.8726545643607571e+306,-7.8726545643607571e+306,-7.8726545643607571e+306,-7.8726545643607571e+306,
-7.8726545643607571e+306,-7.8726545643607571e+306,-7.8726545643607571e+306,1.8018745534686453e+307,
1.8018745534686453e+307,-2.727274834075473e+307,-2.727274834075473e+307,-2.727274834075473e+307,
-2.727274834075473e+307,-2.727274834075473e+307,-3.9511159250950192e+307,-3.9511159250950192e+307,
-3.9511159250950192e+307,-3.9511159250950192e+307,-2.9220632163786983e+307,-2.9220632163786983e+307,
-2.9220632163786983e+307,-3.9550326807426145e+307,-3.9550326807426145e+307,-3.9550326807426145e+307,
-3.9550326807426145e+307,-3.9550326807426145e+307,-3.9550326807426145e+307,-1.900795218050189e+307,
-4.6111471708172531e+306,-4.6111471708172531e+306,-4.6111471708172531e+306,-4.6111471708172531e+306,
-4.6111471708172531e+306,3.996709520468354e+307,3.996709520468354e+307,3.996709520468354e+307,
3.996709520468354e+307,3.996709520468354e+307,5.1955152235197585e+307,5.1955152235197585e+307,
5.1955152235197585e+307,5.1955152235197585e+307,5.1955152235197585e+307,5.1955152235197585e+307,
5.1955152235197585e+307,7.2599567509417792e+307,7.2599567509417792e+307,7.2599567509417792e+307,
7.2599567509417792e+307,7.2599567509417792e+307,-2.727274834075473e+307,-2.727274834075473e+307,
-2.727274834075473e+307,-2.727274834075473e+307,-2.727274834075473e+307,6.640906131316438e+306,
6.640906131316438e+306,6.640906131316438e+306,6.640906131316438e+306,6.640906131316438e+306,
6.640906131316438e+306,6.640906131316438e+306,6.640906131316438e+306,1.0145923254390491e+307,
1.0145923254390491e+307,1.0145923254390491e+307,1.0145923254390491e+307,1.0145923254390491e+307,
1.0145923254390491e+307,1.0145923254390491e+307,1.0145923254390491e+307,5.9736272051675239e+306,
5.9736272051675239e+306,5.9736272051675239e+306,5.9736272051675239e+306,5.9736272051675239e+306,
5.9736272051675239e+306,-7.8726545643607571e+306,-7.8726545643607571e+306,-7.8726545643607571e+306,
-7.8726545643607571e+306,-7.8726545643607571e+306,-7.8726545643607571e+306,-7.8726545643607571e+306
Steps/Code to reproduce bug
// Minimal reproduction: build one int32_t key column and one double value
// column with the benchmark random-data generator, then print both.
auto constexpr num_rows    = 100;
auto constexpr cardinality = 1;  // also run with 2, 4, 8, 16, 32

// Keys: int32_t column, UNIFORM distribution with bounds 0 and num_rows,
// generated with the cardinality selected above.
auto const keys = [&] {
  data_profile const profile =
    data_profile_builder()
      .cardinality(cardinality)
      .no_validity()
      .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
  return create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
}();

// Values: double column, UNIFORM distribution with bounds 0 and num_rows.
// NOTE(review): cardinality(0) presumably means "no cardinality limit" -- confirm
// against the data_profile documentation.
auto const vals = [&] {
  auto builder = data_profile_builder().cardinality(0).no_validity().distribution(
    cudf::type_to_id<double>(), distribution_id::UNIFORM, 0, num_rows);
  return create_random_column(
    cudf::type_to_id<double>(), row_count{num_rows}, data_profile{builder});
}();

// Dump both columns so the generated values can be inspected.
cudf::test::print(keys->view());
cudf::test::print(vals->view());
Expected behavior
- With a uniform distribution, distinct elements should be evenly represented across the range, without obvious skew.
- Floating-point elements should be generated within the specified range.