23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
26#if !defined(__HIPCC_RTC__)
30#if defined(__has_attribute) && __has_attribute(maybe_undef)
31#define MAYBE_UNDEF __attribute__((maybe_undef))
36__device__
static inline unsigned __hip_ds_bpermute(
int index,
unsigned src) {
37 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
38 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
42__device__
static inline float __hip_ds_bpermutef(
int index,
float src) {
43 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
44 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
48__device__
static inline unsigned __hip_ds_permute(
int index,
unsigned src) {
49 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
50 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
54__device__
static inline float __hip_ds_permutef(
int index,
float src) {
55 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
56 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
60#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
61#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
64__device__
static inline unsigned __hip_ds_swizzle_N(
unsigned int src) {
65 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
66 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
71__device__
static inline float __hip_ds_swizzlef_N(
float src) {
72 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
73 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
77#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
78 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
80template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
81__device__
static inline int __hip_move_dpp_N(
int src) {
82 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
87static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
92int __all(
int predicate) {
93 return __ockl_wfall_i32(predicate);
98int __any(
int predicate) {
99 return __ockl_wfany_i32(predicate);
107unsigned long long int __ballot(
int predicate) {
108 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
113unsigned long long int __ballot64(
int predicate) {
114 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
118#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
123unsigned long long __activemask() {
124 return __ballot(
true);
128__device__
static inline unsigned int __lane_id() {
129 return __builtin_amdgcn_mbcnt_hi(
130 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
135int __shfl(MAYBE_UNDEF
int var,
int src_lane,
int width = warpSize) {
136 int self = __lane_id();
137 int index = (src_lane & (width - 1)) + (self & ~(width-1));
138 return __builtin_amdgcn_ds_bpermute(index<<2, var);
142unsigned int __shfl(MAYBE_UNDEF
unsigned int var,
int src_lane,
int width = warpSize) {
143 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
144 tmp.i = __shfl(tmp.i, src_lane, width);
149float __shfl(MAYBE_UNDEF
float var,
int src_lane,
int width = warpSize) {
150 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
151 tmp.i = __shfl(tmp.i, src_lane, width);
156double __shfl(MAYBE_UNDEF
double var,
int src_lane,
int width = warpSize) {
157 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
158 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
160 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
161 tmp[0] = __shfl(tmp[0], src_lane, width);
162 tmp[1] = __shfl(tmp[1], src_lane, width);
164 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
165 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
170long __shfl(MAYBE_UNDEF
long var,
int src_lane,
int width = warpSize)
173 static_assert(
sizeof(long) == 2 *
sizeof(int),
"");
174 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
176 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
177 tmp[0] = __shfl(tmp[0], src_lane, width);
178 tmp[1] = __shfl(tmp[1], src_lane, width);
180 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
181 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
184 static_assert(
sizeof(long) ==
sizeof(int),
"");
185 return static_cast<long>(__shfl(
static_cast<int>(var), src_lane, width));
190unsigned long __shfl(MAYBE_UNDEF
unsigned long var,
int src_lane,
int width = warpSize) {
192 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
193 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
195 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
196 tmp[0] = __shfl(tmp[0], src_lane, width);
197 tmp[1] = __shfl(tmp[1], src_lane, width);
199 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
200 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
203 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
204 return static_cast<unsigned long>(__shfl(
static_cast<unsigned int>(var), src_lane, width));
209long long __shfl(MAYBE_UNDEF
long long var,
int src_lane,
int width = warpSize)
211 static_assert(
sizeof(
long long) == 2 *
sizeof(int),
"");
212 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
214 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
215 tmp[0] = __shfl(tmp[0], src_lane, width);
216 tmp[1] = __shfl(tmp[1], src_lane, width);
218 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
219 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
224unsigned long long __shfl(MAYBE_UNDEF
unsigned long long var,
int src_lane,
int width = warpSize) {
225 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
226 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
228 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
229 tmp[0] = __shfl(tmp[0], src_lane, width);
230 tmp[1] = __shfl(tmp[1], src_lane, width);
232 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
233 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
239int __shfl_up(MAYBE_UNDEF
int var,
unsigned int lane_delta,
int width = warpSize) {
240 int self = __lane_id();
241 int index = self - lane_delta;
242 index = (index < (self & ~(width-1)))?self:index;
243 return __builtin_amdgcn_ds_bpermute(index<<2, var);
247unsigned int __shfl_up(MAYBE_UNDEF
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
248 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
249 tmp.i = __shfl_up(tmp.i, lane_delta, width);
254float __shfl_up(MAYBE_UNDEF
float var,
unsigned int lane_delta,
int width = warpSize) {
255 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
256 tmp.i = __shfl_up(tmp.i, lane_delta, width);
261double __shfl_up(MAYBE_UNDEF
double var,
unsigned int lane_delta,
int width = warpSize) {
262 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
263 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
265 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
266 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
267 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
269 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
270 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
275long __shfl_up(MAYBE_UNDEF
long var,
unsigned int lane_delta,
int width = warpSize)
278 static_assert(
sizeof(long) == 2 *
sizeof(int),
"");
279 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
281 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
282 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
283 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
285 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
286 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
289 static_assert(
sizeof(long) ==
sizeof(int),
"");
290 return static_cast<long>(__shfl_up(
static_cast<int>(var), lane_delta, width));
296unsigned long __shfl_up(MAYBE_UNDEF
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
299 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
300 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
302 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
303 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
304 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
306 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
307 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
310 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
311 return static_cast<unsigned long>(__shfl_up(
static_cast<unsigned int>(var), lane_delta, width));
317long long __shfl_up(MAYBE_UNDEF
long long var,
unsigned int lane_delta,
int width = warpSize)
319 static_assert(
sizeof(
long long) == 2 *
sizeof(int),
"");
320 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
321 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
322 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
323 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
324 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
325 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
331unsigned long long __shfl_up(MAYBE_UNDEF
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
333 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
334 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
335 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
336 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
337 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
338 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
339 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
345int __shfl_down(MAYBE_UNDEF
int var,
unsigned int lane_delta,
int width = warpSize) {
346 int self = __lane_id();
347 int index = self + lane_delta;
348 index = (int)((self&(width-1))+lane_delta) >= width?self:index;
349 return __builtin_amdgcn_ds_bpermute(index<<2, var);
353unsigned int __shfl_down(MAYBE_UNDEF
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
354 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
355 tmp.i = __shfl_down(tmp.i, lane_delta, width);
360float __shfl_down(MAYBE_UNDEF
float var,
unsigned int lane_delta,
int width = warpSize) {
361 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
362 tmp.i = __shfl_down(tmp.i, lane_delta, width);
367double __shfl_down(MAYBE_UNDEF
double var,
unsigned int lane_delta,
int width = warpSize) {
368 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
369 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
371 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
372 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
373 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
375 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
376 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
381long __shfl_down(MAYBE_UNDEF
long var,
unsigned int lane_delta,
int width = warpSize)
384 static_assert(
sizeof(long) == 2 *
sizeof(int),
"");
385 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
387 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
388 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
389 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
391 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
392 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
395 static_assert(
sizeof(long) ==
sizeof(int),
"");
396 return static_cast<long>(__shfl_down(
static_cast<int>(var), lane_delta, width));
401unsigned long __shfl_down(MAYBE_UNDEF
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
404 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
405 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
407 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
408 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
409 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
411 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
412 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
415 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
416 return static_cast<unsigned long>(__shfl_down(
static_cast<unsigned int>(var), lane_delta, width));
421long long __shfl_down(MAYBE_UNDEF
long long var,
unsigned int lane_delta,
int width = warpSize)
423 static_assert(
sizeof(
long long) == 2 *
sizeof(int),
"");
424 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
425 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
426 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
427 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
428 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
429 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
434unsigned long long __shfl_down(MAYBE_UNDEF
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
436 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
437 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
438 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
439 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
440 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
441 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
442 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
448int __shfl_xor(MAYBE_UNDEF
int var,
int lane_mask,
int width = warpSize) {
449 int self = __lane_id();
450 int index = self^lane_mask;
451 index = index >= ((self+width)&~(width-1))?self:index;
452 return __builtin_amdgcn_ds_bpermute(index<<2, var);
456unsigned int __shfl_xor(MAYBE_UNDEF
unsigned int var,
int lane_mask,
int width = warpSize) {
457 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
458 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
463float __shfl_xor(MAYBE_UNDEF
float var,
int lane_mask,
int width = warpSize) {
464 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
465 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
470double __shfl_xor(MAYBE_UNDEF
double var,
int lane_mask,
int width = warpSize) {
471 static_assert(
sizeof(double) == 2 *
sizeof(int),
"");
472 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
474 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
475 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
476 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
478 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
479 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
484long __shfl_xor(MAYBE_UNDEF
long var,
int lane_mask,
int width = warpSize)
487 static_assert(
sizeof(long) == 2 *
sizeof(int),
"");
488 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
490 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
491 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
492 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
494 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
495 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
498 static_assert(
sizeof(long) ==
sizeof(int),
"");
499 return static_cast<long>(__shfl_xor(
static_cast<int>(var), lane_mask, width));
504unsigned long __shfl_xor(MAYBE_UNDEF
unsigned long var,
int lane_mask,
int width = warpSize)
507 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
508 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
510 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
511 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
512 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
514 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
515 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
518 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
519 return static_cast<unsigned long>(__shfl_xor(
static_cast<unsigned int>(var), lane_mask, width));
524long long __shfl_xor(MAYBE_UNDEF
long long var,
int lane_mask,
int width = warpSize)
526 static_assert(
sizeof(
long long) == 2 *
sizeof(int),
"");
527 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
528 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
529 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
530 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
531 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
532 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
537unsigned long long __shfl_xor(MAYBE_UNDEF
unsigned long long var,
int lane_mask,
int width = warpSize)
539 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
540 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
541 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
542 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
543 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
544 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
545 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...