#pragma once
#include <cuda.h>
#include <stdint.h>
// https://maxliani.wordpress.com/2024/03/09/about-fast-2d-cdf-construction/
#if defined(__CUDA_ARCH__)
#define COMPILING_DEVICE_CODE 1
#else
#define COMPILING_DEVICE_CODE 0
#endif
#define PI 3.14159265359f
inline __device__ float4& operator+=(float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
inline __device__ float4& operator+=(float4& a, float k) { a.x += k; a.y += k; a.z += k; a.w += k; return a; }
// Needed by PrefixSumExclusive<float4> to subtract the original value from the inclusive scan.
inline __device__ float4& operator-=(float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
inline __device__ float4 operator*(const float4& a, float k) { return make_float4(a.x * k, a.y * k, a.z * k, a.w * k); }
inline __device__ float4& operator*=(float4& a, float k) { a.x *= k; a.y *= k; a.z *= k; a.w *= k; return a; }
inline __device__ __host__ int divRoundUp(int x, int d)
{
return (x + d - 1) / d;
}
template<typename Tn, typename T> __device__ inline T GetLastFieldOf(const Tn& val);
template<> __device__
inline float GetLastFieldOf<float, float>(const float& val)
{
return val;
}
template<> __device__
inline float GetLastFieldOf<float4, float>(const float4& val)
{
return val.w;
}
template<typename Tn> __device__ inline void PrefixSumComponents(Tn& val);
template<> __device__
inline void PrefixSumComponents<float>(float& val)
{}
template<> __device__
inline void PrefixSumComponents<float4>(float4& val)
{
val.y += val.x;
val.z += val.y;
val.w += val.z;
}
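// For example, PrefixSumComponents turns a float4 {1, 2, 3, 4} into the inclusive running
// sums {1, 3, 6, 10}; the .w field then holds the 4-wide total used by the block-level scan.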
template<typename T>
struct PrefixSumInclusive
{
__device__
PrefixSumInclusive(const T&) {}
__device__
void operator()(T&) const
{
// The algorithm natively computes the inclusive sum. Nothing to do here.
}
};
template<typename T>
struct PrefixSumExclusive
{
const T originalValue;
__device__
PrefixSumExclusive(const T& value) : originalValue(value) {}
__device__
void operator()(T& value) const
{
// To obtain the exclusive prefix-sum from the inclusive one, subtract the original input value.
value -= originalValue;
}
};
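// For example, for a lane holding {1, 2, 3, 4} the inclusive scan produces {1, 3, 6, 10},
// while the exclusive variant subtracts the original values to give {0, 1, 3, 6}.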
// Compute the parallel prefix-sum of a thread block
template<typename ScanMode, typename Tn, typename T> __device__
inline void PrefixSumBlock(Tn& val, volatile T* smem, T& total)
{
const ScanMode scanMode(val);
// Prepare the prefix sum within the n components of Tn. This lets us run the Hillis-Steele
// algorithm on one element per thread (the lane total) and reduces the number of operations
// in shared memory.
PrefixSumComponents(val);
for (int i = 1; i < blockDim.x; i <<= 1)
{
smem[threadIdx.x] = GetLastFieldOf<Tn, T>(val);
__syncthreads();
// Add the value to the local variable
if (threadIdx.x >= i)
{
const T offset = smem[threadIdx.x - i];
val += offset;
}
__syncthreads();
}
// Apply prefix sum running total from the previous block
val += total;
__syncthreads();
// Update the prefix sum running total using the last thread in the block
if (threadIdx.x == blockDim.x - 1)
{
total = GetLastFieldOf<Tn, T>(val);
}
__syncthreads();
// Apply the final transformation (i.e. subtract the initial value for exclusive prefix sums)
scanMode(val);
}
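// A minimal usage sketch (illustrative, not part of the original file): a single-block
// kernel that scans `count` float4 elements in place. It assumes a launch such as
//   scanExample<PrefixSumInclusive<float4>><<<1, 256, 256 * sizeof(float)>>>(data, count);
// where the dynamic shared memory holds one float per thread.
template<typename ScanMode> __global__
void scanExample(float4* data, int count)
{
    extern __shared__ float smemf[];
    __shared__ float total;
    if (threadIdx.x == 0) total = 0;
    __syncthreads();
    // Every thread runs the same number of iterations, since PrefixSumBlock synchronizes.
    const int numBlocks = divRoundUp(count, blockDim.x);
    for (int blockN = 0, index = threadIdx.x; blockN < numBlocks; ++blockN, index += blockDim.x)
    {
        float4 val = (index < count) ? data[index] : make_float4(0, 0, 0, 0);
        PrefixSumBlock<ScanMode>(val, smemf, total);
        if (index < count) data[index] = val;
    }
}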
// Example Remaps /////////////////////////////////////////////////
// No remap, this works for light maps for rect area lights.
struct Cdf2dIdentity
{
__device__
static void factorX(int index, int w, float4& val)
{}
__device__
static void factorY(int index, int h, float4& val)
{}
};
// The Jacobian for the change of variables from cartesian to spherical coordinates. This is
// used for dome light latlong maps. No scaling is required within the rows; we only weight
// the rows themselves.
struct Cdf2dSphericalCoordsJacobian
{
__device__
static void factorX(int index, int w, float4& val)
{}
__device__
static void factorY(int index, int h, float4& val)
{
// Each index represents 4 latitudes in spherical coordinates. The desired angles
// are those at the centers of those 4 pixels:
//
// Index: 0 -> .---.
//             | x | <- +0.125
//             |---|
//             | y | <- +0.375
//             |---|                 index + offset
//             | z | <- +0.625  pi * ---------------
//             |---|                        h
//             | w | <- +0.875
//        1 -> |---|
//             | x | <- +0.125
// ...
val.x *= sinf(float(PI) * (float(index) + 0.125f) / float(h));
val.y *= sinf(float(PI) * (float(index) + 0.375f) / float(h));
val.z *= sinf(float(PI) * (float(index) + 0.625f) / float(h));
val.w *= sinf(float(PI) * (float(index) + 0.875f) / float(h));
}
};
__device__ inline float luma(float r, float g, float b)
{
return r * 0.2126f + g * 0.7152f + b * 0.0722f;
}
__device__ inline float luma(float3 rgb)
{
return luma(rgb.x, rgb.y, rgb.z);
}
// Example Loaders /////////////////////////////////////////////////
struct CdfLoaderRgb
{
__device__
static void load(const void* input, uint32_t index, float4& val)
{
// Load 4 RGB texels in AOS layout. This emits three 128-byte loads.
const float4 a = ((float4*)input)[index * 3 + 0]; // R
const float4 b = ((float4*)input)[index * 3 + 1]; // G
const float4 c = ((float4*)input)[index * 3 + 2]; // B
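// Across a, b and c the 12 floats cover 4 texels as:
//   a = { r0, g0, b0, r1 }, b = { g1, b1, r2, g2 }, c = { b2, r3, g3, b3 }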
// Shuffle the RGB values from AOS to 4-wide SOA, and
// apply a clamp since negative values are not allowed.
val.x = max(0.0f, luma(a.x, a.y, a.z));
val.y = max(0.0f, luma(a.w, b.x, b.y));
val.z = max(0.0f, luma(b.z, b.w, c.x));
val.w = max(0.0f, luma(c.y, c.z, c.w));
}
};
using rgb64_t = uint64_t;
__device__ __host__
inline void rgb64_pack(float r, float g, float b, rgb64_t& rgb)
{
constexpr int kBits = 21;
constexpr int kShift = 32 - kBits;
#if COMPILING_DEVICE_CODE
uint64_t ri = __float_as_uint(r) >> kShift;
uint64_t gi = __float_as_uint(g) >> kShift;
uint64_t bi = __float_as_uint(b) >> kShift;
#else
uint64_t ri = ((uint32_t&)r) >> kShift;
uint64_t gi = ((uint32_t&)g) >> kShift;
uint64_t bi = ((uint32_t&)b) >> kShift;
#endif
rgb = ri | (gi << kBits) | (bi << (kBits * 2));
}
__device__ __host__
inline void rgb64_unpack(rgb64_t rgb, float& r, float& g, float& b)
{
constexpr int kBits = 21;
constexpr int kShift = 32 - kBits;
constexpr uint32_t kMask = (1u << kBits) - 1;
#if COMPILING_DEVICE_CODE
r = __uint_as_float(((rgb ) & kMask) << kShift);
g = __uint_as_float(((rgb >> kBits ) & kMask) << kShift);
b = __uint_as_float(((rgb >> (kBits*2)) & kMask) << kShift);
#else
uint32_t ri = ((rgb ) & kMask) << kShift;
uint32_t gi = ((rgb >> kBits ) & kMask) << kShift;
uint32_t bi = ((rgb >> (kBits*2)) & kMask) << kShift;
r = (float&)ri;
g = (float&)gi;
b = (float&)bi;
#endif
}
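// A minimal round-trip sketch (rgb64_roundTripError is a hypothetical helper, not part of
// the original file). Keeping the top 21 bits of each IEEE-754 float preserves the sign,
// the full 8-bit exponent and the 12 most significant mantissa bits, so the relative
// quantization error is bounded by about 2^-12. Inputs with short mantissas, such as
// 0.25f, 0.5f or 1.0f, round-trip exactly.
__device__ __host__
inline float rgb64_roundTripError(float r, float g, float b)
{
    rgb64_t packed;
    rgb64_pack(r, g, b, packed);
    float r2, g2, b2;
    rgb64_unpack(packed, r2, g2, b2);
    // Sum of absolute channel errors introduced by the mantissa truncation.
    return fabsf(r - r2) + fabsf(g - g2) + fabsf(b - b2);
}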
// Example of a 64-bit packed RGB loader
struct CdfLoaderRgb64
{
__device__
static void load(const void* input, uint32_t index, float4& val)
{
struct __builtin_align__(32) Loader
{
rgb64_t a, b, c, d;
};
// Load 4 packed RGB values using two 128-bit load instructions.
Loader load4 = ((Loader*)input)[index];
// Shuffle/unpack from AOS to 4-wide SOA
float3 a, b, c, d;
rgb64_unpack(load4.a, a.x, a.y, a.z);
rgb64_unpack(load4.b, b.x, b.y, b.z);
rgb64_unpack(load4.c, c.x, c.y, c.z);
rgb64_unpack(load4.d, d.x, d.y, d.z);
// Clamp negative values
val.x = max(0.0f, luma(a));
val.y = max(0.0f, luma(b));
val.z = max(0.0f, luma(c));
val.w = max(0.0f, luma(d));
}
};
// Create a table to sample from a discrete 2d distribution, such as that of an HDR light map.
// @param w, h, the texture width and height
// @param input, the input texture data
// @param cdf_x, the buffer where to store the per-row conditional CDFs; the buffer is
//        expected to hold as many elements as there are texels in the texture.
// @param cdf_y, the buffer where to store the marginal CDF over the rows; the buffer is
//        expected to hold as many elements as there are rows in the texture.
// @param counter, a device int initialized to zero, used to elect the last block to run step 2.
template<typename ScanMode, typename Loader, typename Remap, int N > __global__
void makeCdf2d_kernel(int w, int h, const void* input, float* cdf_x, float* cdf_y, int* counter)
{
const int index_y = blockIdx.y;
// Init the CDF running total in shared memory. This carries the prefix sum forward
// across the loop iterations and stores the row's final sum at the end.
__shared__ float total;
if (threadIdx.x == 0)
{
total = 0;
}
__syncthreads();
// Memory for the block-wise prefix scan. Its size is specified in the kernel's launch parameters.
extern __shared__ float smemf[];
// Step 1: compute the conditional sampling cumulative distribution of each row in the image.
{
// Each block produces the CDF of an entire row and stores it into registers in valN.
// This assumes 4 elements per thread and blocks of 256 threads, i.e. 1024 elements per
// iteration. By doing this we load the values once, compute the CDF, normalize it and
// store the result: one read, one write.
// Don't consider valN as a vector. The actual memory layout it covers is:
// - 4 consecutive elements per thread, as the x, y, z, w components of the float4,
// - spread across the threads of the block,
// - times the number of iterations N.
float4 valN[N] = {};
constexpr int kNumFields = sizeof(float4) / sizeof(float);
const int numElements = divRoundUp(w, kNumFields);
// Consume a row, produce the prefix sum.
#pragma unroll
for (int blockN = 0, index_x = (threadIdx.x + blockDim.x * blockIdx.x); blockN < N;
++blockN, index_x += blockDim.x)
{
float4 val = make_float4(0, 0, 0, 0);
bool inRange = (index_x < numElements);
if (inRange)
{
// Load 4 texels
Loader::load(input, index_x + index_y * numElements, val);
Remap::factorX(index_x, numElements, val);
}
// Block-wise prefix scan
PrefixSumBlock<ScanMode>(val, smemf, total);
// Write the result to registers
valN[blockN] = val;
}
// Normalize the row CDF and write the data from registers to the output buffer.
float normalization = (total != 0.0f ? 1.0f / total : 0.0f);
#pragma unroll
for (int blockN = 0, index_x = (threadIdx.x + blockDim.x * blockIdx.x); blockN < N;
++blockN, index_x += blockDim.x)
{
bool inRange = (index_x < numElements);
if (inRange)
{
((float4*)cdf_x)[index_x + index_y * numElements] = valN[blockN] * normalization;
}
}
}
// Let the block store the row running total as the input value to compute the cdf_y (column)
__shared__ bool done;
if (threadIdx.x == 0)
{
cdf_y[index_y] = total;
total = 0; //< reset the running total to use it for cdf_y
// Are we done computing all row CDFs? This atomic returns true only for the last block
// completing the work.
done = atomicAdd(counter, 1) == (h-1);
}
__syncthreads();
// All blocks terminate here except for the last one to finish
if (!done) return;
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-fence-functions
__threadfence();
// Step 2: compute the marginal cumulative sampling distribution (cdf_y) over the rows,
// starting from the rows' running totals.
{
constexpr int kNumFields = sizeof(float4) / sizeof(float);
const int numElements = divRoundUp(h, kNumFields);
// Step 2a: compute the marginal CDF in place. This is very similar to step 1. However:
// - Don't assume the vertical resolution is lower than or equal to the horizontal one.
// - Instead of unrolling the loop and storing intermediate values in registers, loop over
//   however many iterations are needed.
// - Store the non-normalized prefix-sum (likely still in cache) and read it back in the
//   normalization step.
// The performance hit of this generalization is small only because step 2 executes as a
// single block, which is a tiny fraction of the workload in step 1.
const uint32_t numBlocks = divRoundUp(numElements, blockDim.x);
for (int blockN = 0, index = threadIdx.x; blockN < numBlocks; ++blockN, index += blockDim.x)
{
float4 val = make_float4(0, 0, 0, 0);
bool inRange = (index < numElements);
if (inRange)
{
val = ((float4*)cdf_y)[index];
Remap::factorY(index, numElements, val);
}
// Block-wise prefix sum
PrefixSumBlock<ScanMode>(val, smemf, total);
// Write the not-normalized result to the output buffer
if (inRange) ((float4*)cdf_y)[index] = val;
}
__syncthreads();
// TODO: Is some kind of memory barrier useful here?
// Step 2b: read back the cdf_y and normalize it in place.
float normalization = (total != 0.0f ? 1.0f / total : 0.0f);
for (int blockN = 0, index = threadIdx.x; blockN < numBlocks; ++blockN, index += blockDim.x)
{
bool inRange = (index < numElements);
if (inRange)
{
((float4*)cdf_y)[index] *= normalization;
}
}
}
}
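// A host-side launch sketch (hypothetical helper; names and constants here are assumptions,
// not part of the original file). The grid runs one block per row, the block-wide scan needs
// one float of dynamic shared memory per thread, and the completion counter must be zeroed
// before launch. kThreads and kN must satisfy kThreads * kN * 4 >= w so that an entire row
// fits in the unrolled per-thread registers.
inline void makeCdf2d(int w, int h, const void* d_input, float* d_cdf_x, float* d_cdf_y, int* d_counter)
{
    constexpr int kThreads = 256; // threads per block
    constexpr int kN = 4;         // float4 elements per thread; supports w up to 4096
    cudaMemset(d_counter, 0, sizeof(int));
    const dim3 grid(1, h);
    const size_t smemSize = kThreads * sizeof(float);
    makeCdf2d_kernel<PrefixSumInclusive<float4>, CdfLoaderRgb, Cdf2dSphericalCoordsJacobian, kN>
        <<<grid, kThreads, smemSize>>>(w, h, d_input, d_cdf_x, d_cdf_y, d_counter);
}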