Last active
February 16, 2016 14:09
-
-
Save gravitino/3b694b6bd6af6937a2b2 to your computer and use it in GitHub Desktop.
A branch-divergence-free `if` in CUDA considered harmful (benchmarked on a Titan X)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
// Error macro: fetches the most recent CUDA runtime error via
// cudaGetLastError(); if it is anything other than cudaSuccess, prints the
// error string together with the file and line of the macro expansion to
// std::cout and terminates the process with exit code 1.
// Expands to a braced block, so it is written without a trailing semicolon.
#define CUERR {                                                              \
    const cudaError_t cuerr_status = cudaGetLastError();                     \
    if (cuerr_status != cudaSuccess) {                                       \
        std::cout << "CUDA error: " << cudaGetErrorString(cuerr_status)      \
                  << " : " << __FILE__ << ", line " << __LINE__              \
                  << std::endl;                                              \
        exit(1);                                                             \
    }                                                                        \
}
// Convenience timers based on CUDA events.
// TIMERSTART(label) declares three variables in the enclosing scope --
// start<label>, stop<label> (events) and time<label> (float) -- creates both
// events and records the start event on stream 0. The matching
// TIMERSTOP(label) must be expanded later in the same scope.
#define TIMERSTART(label)                                                    \
    float time##label;                                                       \
    cudaEvent_t start##label;                                                \
    cudaEvent_t stop##label;                                                 \
    cudaEventCreate(&start##label);                                          \
    cudaEventCreate(&stop##label);                                           \
    cudaEventRecord(start##label, 0);
// TIMERSTOP(label) records the stop event on stream 0, blocks the host until
// that event has completed, stores the elapsed milliseconds between
// start<label> and stop<label> in time<label> and prints
// "<time> ms (<label>)" to std::cout.
// Both events are destroyed afterwards so that repeated timing sections do
// not leak event handles; time<label> remains readable after the macro.
#define TIMERSTOP(label)                                                     \
    cudaEventRecord(stop##label, 0);                                         \
    cudaEventSynchronize(stop##label);                                       \
    cudaEventElapsedTime(&time##label, start##label, stop##label);           \
    std::cout << time##label << " ms (" << #label << ")" << std::endl;       \
    cudaEventDestroy(start##label);                                          \
    cudaEventDestroy(stop##label);
// Applies cos (sigma == true) or sin (sigma == false) n_iters times to every
// element of data[0 .. length), in place. The choice between cos and sin is a
// template parameter, i.e. fixed at compile time.
//
// Template parameters:
//   value_t  element type of data
//   index_t  integer type used for indexing
//   sigma    true -> iterate cos, false -> iterate sin
//   n_iters  number of applications per element (values <= 0 leave the
//            element unchanged)
//
// Launch layout: 1D grid of 1D blocks; any grid/block size is valid because
// each thread walks the array with a grid-sized stride.
template <class value_t, class index_t, bool sigma, int n_iters=100> __global__
void templated_kernel(value_t * data, index_t length) {

    // Widen to index_t *before* multiplying: the built-in dimensions are
    // 32-bit unsigned, so their product would otherwise wrap for large grids
    // and yield a wrong start index or a zero stride.
    const index_t stride = static_cast<index_t>(blockDim.x) * gridDim.x;
    const index_t first  = static_cast<index_t>(blockDim.x) * blockIdx.x
                         + threadIdx.x;

    for (index_t id = first; id < length; id += stride) {

        value_t value = data[id];

        // int counter matches the type of n_iters, avoiding a signed/unsigned
        // comparison when index_t is unsigned
        for (int iter = 0; iter < n_iters; iter++) {
            if (sigma)
                value = cos(value);
            else
                value = sin(value);
        }

        data[id] = value;
    }
}
// Applies cos (sigma == true) or sin (sigma == false) n_iters times to every
// element of data[0 .. length), in place. Unlike templated_kernel, the choice
// between cos and sin is a runtime kernel argument that is tested inside the
// inner loop; it has the same value for every thread of a launch.
//
// Template parameters:
//   value_t  element type of data
//   index_t  integer type used for indexing
//   n_iters  number of applications per element (values <= 0 leave the
//            element unchanged)
//
// Launch layout: 1D grid of 1D blocks; any grid/block size is valid because
// each thread walks the array with a grid-sized stride.
template <class value_t, class index_t, int n_iters=100> __global__
void parametrized_kernel(value_t * data, index_t length, bool sigma) {

    // Widen to index_t *before* multiplying: the built-in dimensions are
    // 32-bit unsigned, so their product would otherwise wrap for large grids
    // and yield a wrong start index or a zero stride.
    const index_t stride = static_cast<index_t>(blockDim.x) * gridDim.x;
    const index_t first  = static_cast<index_t>(blockDim.x) * blockIdx.x
                         + threadIdx.x;

    for (index_t id = first; id < length; id += stride) {

        value_t value = data[id];

        // int counter matches the type of n_iters, avoiding a signed/unsigned
        // comparison when index_t is unsigned
        for (int iter = 0; iter < n_iters; iter++) {
            if (sigma)
                value = cos(value);
            else
                value = sin(value);
        }

        data[id] = value;
    }
}
// compile with nvcc -O3 -std=c++11 -arch=sm_35 cuda_if.cu

// Benchmark driver: allocates 2^30 zero-initialised floats on the device and
// times two back-to-back launches (sin, then cos) of the compile-time-switched
// kernel against the same two launches of the runtime-switched kernel.
// Each timing is printed by TIMERSTOP; any CUDA error aborts via CUERR.
int main () {

    typedef size_t index_t;
    typedef float value_t;

    // 2^30 elements * 4 bytes = 4 GiB of device memory
    index_t length = index_t(1) << 30;
    value_t * data = nullptr;

    cudaMalloc(&data, sizeof(value_t)*length);                            CUERR
    cudaMemset(data, 0, sizeof(value_t)*length);                          CUERR

    // reference timings for the templated variant:
    // 2263.26 ms
    // 2300.9 ms
    // 2290.04 ms
    TIMERSTART(templated)
    templated_kernel<value_t, index_t, false><<<1024, 1024>>>(data, length); CUERR
    templated_kernel<value_t, index_t, true><<<1024, 1024>>>(data, length);  CUERR
    TIMERSTOP(templated)
    // TIMERSTOP has synchronised on the stop event, so errors raised while
    // the kernels were running (not just launch errors) can be reported here
    CUERR

    // reference timings for the parametrized variant:
    // 2745.73 ms
    // 2729.74 ms
    // 2745.31 ms
    TIMERSTART(parametrized)
    parametrized_kernel<<<1024, 1024>>>(data, length, false);             CUERR
    parametrized_kernel<<<1024, 1024>>>(data, length, true);              CUERR
    TIMERSTOP(parametrized)
    CUERR

    // release the device buffer instead of relying on process teardown
    cudaFree(data);                                                       CUERR

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment