//
//
//
//
//
//CUDA Synchronization | A quick overview of synchronization in CUDA | Cuda Education | Cuda Tutorial
//Company: CUDA Education | cudaeducation.com | cudaeducation@gmail.com | Please donate at cudaeducation.com
//YouTube Channel (please subscribe): https://www.youtube.com/channel/UCzpwNg0Ai8zCzbsEtozkfFQ
//Twitter: @cudaeducation
//Have questions? Comment on the YouTube channel https://www.youtube.com/channel/UCzpwNg0Ai8zCzbsEtozkfFQ or email cudaeducation@gmail.com
//Donate: Visit cudaeducation.com to donate to the cause
//DISCLAIMER: This code is for teaching purposes only! CUDA Education does not guarantee the accuracy of this code in any way. This code should not be used in a production or commercial environment. Any liabilities or loss resulting from the use of this code, in whole or in part, will not be the responsibility of CUDA Education.
//All rights reserved. This code is the property of CUDA Education. Please contact CUDA Education at cudaeducation@gmail.com if you would like to use this code in any way, shape or form.
//CODING ENVIRONMENT:
//CUDA Toolkit 10.0
//Windows environment
//Visual Studio 2017 Community Edition
//nVidia GeForce 1050 ti Graphics Card
//Compute Capability 6.1 (the GeForce GTX 1050 Ti is a Pascal-architecture GPU; 6.5 is not a valid compute capability)
//Install CUDA Toolkit cudaeducation.com/howtoprogramcuda
//make sure to include all relevant libraries
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <windows.h>
#include <stdio.h>
//Kernel demonstrating warp divergence.
//Launch layout: 1-D grid of 1-D blocks (see main). gpudummy is passed BY VALUE,
//so the +1/-1 updates below are private to each thread and have no effect visible
//to the host — the kernel exists purely to illustrate divergence cost.
__global__ void warp_divergence_on_CUDA_GPU(unsigned int gpudummy)
{
	//calculate the linear (flat) address of the thread across the whole grid in x.
	//threadIdx.x alone is ambiguous (every block has a thread 0), so we combine it
	//with blockIdx.x * blockDim.x to identify exactly which thread this is globally.
	unsigned int linear_id_of_thread = blockIdx.x * blockDim.x + threadIdx.x;

	//START thread warp divergence code
	//Warps execute 32 threads in lockstep; when lanes of one warp disagree on a
	//branch condition, the warp executes BOTH paths with inactive lanes masked off,
	//so a divergent if/else roughly doubles the work. Comment the branch out and
	//the kernel runs noticeably faster.
	//
	//BUG FIX (review): the original version called __syncthreads() INSIDE both the
	//"if" and the "else" bodies. Because even and odd threads of the SAME block take
	//different paths here, not all threads reach the same barrier — that is
	//undefined behavior in CUDA and can deadlock the block. __syncthreads() must be
	//reached by every thread of the block, so the barriers are hoisted outside the
	//divergent region (before the branch and after it reconverges).
	__syncthreads();
	if (linear_id_of_thread % 2 == 0)
	{
		gpudummy = gpudummy + 1;
	}
	else
	{
		gpudummy = gpudummy - 1;
	}
	__syncthreads();
	//END thread warp divergence code

	//CUDA EDUCATION
	//Website: cudaeducation.com
	//Twitter: @cudaeducation
	//Email: cudaeducation@gmail.com
	//YouTube: Cuda Education | Please subscribe
	//Donate: Visit cudaeducation.com to donate to the cause
}
//Check every CUDA runtime call. CUDA errors are "sticky" and otherwise silent:
//an unchecked failure here would make every later call (and the printed timing)
//meaningless, so we fail fast with file/line context instead.
#define CUDA_CHECK(call)                                                      \
	do {                                                                      \
		cudaError_t err_ = (call);                                            \
		if (err_ != cudaSuccess) {                                            \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
			        cudaGetErrorString(err_));                                \
			return 1;                                                         \
		}                                                                     \
	} while (0)

//Host driver: selects the device, computes the launch configuration, times the
//kernel with CUDA events, and prints the elapsed milliseconds.
//Returns 0 on success, 1 on any CUDA error.
int main(int argc, char **argv)
{
	//set the device to be used for CUDA execution
	int cuda_education_device = 0;
	CUDA_CHECK(cudaSetDevice(cuda_education_device));

	//how many integers are going to be in your (conceptual) array?
	//this only sizes the grid — no array is actually allocated in this demo.
	int my_array_size = 80000000;

	//dummy variable passed to the kernel; it is passed by value, so the kernel's
	//updates to it are never visible on the host.
	int dummy = 0;

	printf("array size %d \n", my_array_size);

	//how many threads in each BLOCK will be launched in the CUDA kernel?
	int number_of_threads_in_each_block = 256;

	//CUDA events for timing how long the kernel takes on the device (GPU).
	//for more information, see https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/
	cudaEvent_t start, stop;
	CUDA_CHECK(cudaEventCreate(&start));
	CUDA_CHECK(cudaEventCreate(&stop));

	//START create variables for the CUDA kernel launch
	//BLOCK holds the number of threads per block (not the number of blocks).
	dim3 BLOCK(number_of_threads_in_each_block, 1);
	//grid size is the ceiling division array_size / threads_per_block, so there
	//are enough threads to cover every element.
	dim3 cuda_education_grid((my_array_size + BLOCK.x - 1) / BLOCK.x, 1);
	printf("number of blocks %d | number of threads in each block %d\n", cuda_education_grid.x, BLOCK.x);
	//END create variables for the CUDA kernel launch

	//make sure all preceding work on the device (GPU) has completed before we
	//start timing, so the measurement covers only this kernel.
	CUDA_CHECK(cudaDeviceSynchronize());

	CUDA_CHECK(cudaEventRecord(start));

	//launch the CUDA kernel with the grid/block configuration computed above.
	warp_divergence_on_CUDA_GPU << <cuda_education_grid, BLOCK >> > (dummy);
	//a kernel launch returns no status directly: launch-configuration errors
	//surface via cudaGetLastError(), execution errors at the next sync below.
	CUDA_CHECK(cudaGetLastError());

	CUDA_CHECK(cudaEventRecord(stop));

	//the CPU continues immediately after launching the kernel; block here until
	//the kernel (and the stop event) have actually completed.
	CUDA_CHECK(cudaDeviceSynchronize());
	CUDA_CHECK(cudaEventSynchronize(stop));

	//calculate and print the amount of time it took the CUDA kernel to process
	float milliseconds = 0;
	CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	printf("DEVICE elapsed time: %f milliseconds\n", milliseconds);

	//practice good housekeeping: release the timing events, then reset the device.
	CUDA_CHECK(cudaEventDestroy(start));
	CUDA_CHECK(cudaEventDestroy(stop));
	CUDA_CHECK(cudaDeviceReset());

	//CUDA EDUCATION
	//Website: cudaeducation.com
	//Twitter: @cudaeducation
	//Email: cudaeducation@gmail.com
	//YouTube: Cuda Education
	//Donate: Visit cudaeducation.com to donate to the cause
	return 0;
}