//CUDA Streams | A very simple demonstration of cuda streams
//Website: cudaeducation.com
//Company: CUDA Education | cudaeducation.com | cudaeducation@gmail.com | Please donate at cudaeducation.com
//YouTube Channel (please subscribe): Cuda Education | https://www.youtube.com/channel/UCzpwNg0Ai8zCzbsEtozkfFQ
//Twitter: @cudaeducation
//Slack: https://bit.ly/2NBBG4h | Join the workspace
//Have questions? Comment on the YouTube channel https://www.youtube.com/channel/UCzpwNg0Ai8zCzbsEtozkfFQ or message on Twitter
//Donate: Visit cudaeducation.com to donate
//DISCLAIMER: This code is for teaching purposes only! CUDA Education does not guarantee the accuracy of this code in any way. This code should not be used in a production or commercial environment. Any liabilities or loss resulting from the use of this code, in whole or in part, will not be the responsibility of CUDA Education.
//All rights reserved. This code is the property of CUDA Education. Please contact CUDA Education at cudaeducation@gmail.com if you would like to use this code in any way, shape or form.
//CODING ENVIRONMENT:
//CUDA Toolkit 9.0
//Windows environment
//Visual Studio 2017 Community Edition
//nVidia GeForce 1050 ti Graphics Card
//Compute Capability 6.1
//make sure to include all relevant libraries
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
//function that will run on the GPU
//function that will run on the GPU
//Kernel that performs dummy busy-work so each stream has something measurable to execute.
//Launch expectations: any 1-D grid/block shape works (the kernel reads no thread indices
//and touches no global memory); no dynamic shared memory is required.
__global__ void cuda_education_function_on_CUDA_GPU()
{
//some pointless processing just to make sure the function is actually called.
//NOTE: gpudummy must be volatile -- without it the compiler sees a loop with no
//observable effect and is free to delete it entirely, leaving an empty kernel and
//making the stream-concurrency timing demo meaningless.
volatile int gpudummy = 0;
for (int i = 0; i < 100000; i++)
{
gpudummy = gpudummy + 1;
gpudummy = gpudummy - 1;
}
//CUDA EDUCATION
//Website: cudaeducation.com
//Twitter: @cudaeducation
//Email: cudaeducation@gmail.com
//YouTube: Cuda Education | Please subscribe
//Slack: https://bit.ly/2NBBG4h | Join the workspace
//Donate: Visit cudaeducation.com to donate
}
//number of concurrent streams used to demonstrate kernel concurrency
#define CUDA_EDU_NUM_STREAMS 22

//Helper: abort with a readable message if a CUDA runtime call failed.
//Every CUDA API call returns cudaError_t; ignoring it hides failures silently.
static void checkCuda(cudaError_t err, const char *what)
{
if (err != cudaSuccess)
{
fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

//Entry point: creates CUDA_EDU_NUM_STREAMS streams, launches one small kernel on each
//so the launches can overlap (kernel concurrency), times the whole batch with CUDA
//events, prints the elapsed time, then cleans everything up.
int main(int argc, char **argv)
{
//get the nVidia GPU running CUDA ready
int cuda_education_device = 0;
//set the device to be used for CUDA execution
checkCuda(cudaSetDevice(cuda_education_device), "cudaSetDevice");
//set up the CUDA timer for tracking how long it takes the kernel on the device (GPU) to do its job
//for more information on this, check out https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/
cudaEvent_t start, stop;
checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
//START create variables for the CUDA kernel launch
//THREADS_IN_BLOCK: number of threads in each block (dim3; unused dims default to 1)
dim3 THREADS_IN_BLOCK(64, 1);
//BLOCKS_IN_GRID: number of blocks in the grid (dim3; unused dims default to 1)
dim3 BLOCKS_IN_GRID(1, 1);
//print out the values used for block and grid
printf("number of blocks %d | number of threads in each block %d\n", BLOCKS_IN_GRID.x, THREADS_IN_BLOCK.x);
//END create variables for the CUDA kernel launch
//START create streams for cuda kernel launches
//an array + loop replaces 22 hand-written stream variables/creates: same behavior, far less copy-paste
cudaStream_t streams[CUDA_EDU_NUM_STREAMS];
for (int s = 0; s < CUDA_EDU_NUM_STREAMS; s++)
{
checkCuda(cudaStreamCreate(&streams[s]), "cudaStreamCreate");
}
//END create streams for cuda kernel launches
//dynamic shared-memory bytes per block
//not relevant in this example, but have to fill in a value when launching kernel with streams
int my_shared_memory = 0;
//make sure all preceding tasks on the device (GPU) have been completed before proceeding with new business
checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
//"start" event marker. this is simply a marker in the code
checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
//launch the CUDA kernel once per stream
//several kernel launches on multiple streams at the same time. this is where kernel concurrency happens
for (int s = 0; s < CUDA_EDU_NUM_STREAMS; s++)
{
cuda_education_function_on_CUDA_GPU << <BLOCKS_IN_GRID, THREADS_IN_BLOCK, my_shared_memory, streams[s] >> > ();
}
//kernel launches do not return errors directly; check for launch-configuration errors explicitly
checkCuda(cudaGetLastError(), "kernel launch");
//to see the sequential (no-concurrency) alternative, launch the same kernels on the
//null stream instead: cuda_education_function_on_CUDA_GPU << <BLOCKS_IN_GRID, THREADS_IN_BLOCK >> > ();
//"stop" event marker. this is simply a marker in the code. it does not guarantee that everything preceding the marker has finished processing.
checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
//the CPU launches kernels asynchronously and immediately continues; cudaEventSynchronize(stop)
//halts the CPU until all work recorded before the stop event has actually finished on the device
checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
//now that all work before the "stop" marker is finished, calculate and print the elapsed time
float milliseconds = 0;
checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
printf("DEVICE elapsed time: %f milliseconds\n", milliseconds);
//practice good housekeeping: destroy the streams (missing in the original), destroy the
//event variables, and reset the device when you are done
for (int s = 0; s < CUDA_EDU_NUM_STREAMS; s++)
{
checkCuda(cudaStreamDestroy(streams[s]), "cudaStreamDestroy");
}
checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
checkCuda(cudaDeviceReset(), "cudaDeviceReset");
//CUDA EDUCATION
//Website: cudaeducation.com
//Twitter: @cudaeducation
//Email: cudaeducation@gmail.com
//YouTube: Cuda Education
//Slack: https://bit.ly/2NBBG4h | Join the workspace
//Donate: Visit cudaeducation.com to donate to the cause
return 0;
}