| #pragma once |
|
|
| #include <stdio.h> |
|
|
| #if defined(__HIPCC__) |
| #define HOST_DEVICE_INLINE __host__ __device__ |
| #define DEVICE_INLINE __device__ |
| #define HOST_INLINE __host__ |
| #elif defined(__CUDACC__) || defined(_NVHPC_CUDA) |
| #define HOST_DEVICE_INLINE __host__ __device__ __forceinline__ |
| #define DEVICE_INLINE __device__ __forceinline__ |
| #define HOST_INLINE __host__ __forceinline__ |
| #else |
| #define HOST_DEVICE_INLINE inline |
| #define DEVICE_INLINE inline |
| #define HOST_INLINE inline |
| #endif |
|
|
| #define CUDA_CHECK(cmd) \ |
| do { \ |
| cudaError_t e = cmd; \ |
| if (e != cudaSuccess) { \ |
| printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ |
| cudaGetErrorString(e)); \ |
| exit(EXIT_FAILURE); \ |
| } \ |
| } while (0) |
|
|
| int64_t get_device_attribute(int64_t attribute, int64_t device_id); |
|
|
| int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); |
|
|
| namespace cuda_utils { |
|
|
| template <typename T> |
| HOST_DEVICE_INLINE constexpr std::enable_if_t<std::is_integral_v<T>, T> |
| ceil_div(T a, T b) { |
| return (a + b - 1) / b; |
| } |
|
|
| }; |