A brief outline of how to go about debugging your CUDA program.
- Probably the best way in my opinion is to use printf functionality to print out variable and system state to the console.
- printf can be used in host code and device code; device-side printf requires a compute capability of 2.0 or higher.
- Be careful not to print so much that the device printf buffer wraps around: old entries get overwritten by newer ones, and you end up reading misleading output.
- The code below prints out the size of the printf buffer, so you can have an idea of what you are up against.
- You can also use the assert function to test if specific values are true in your code.
- The assert function is also able to evaluate built-in CUDA variables in host and device code, so that is helpful when tackling very detailed processes.
- In my experience, it will throw an assertion for each and every thread that it comes across and display it in the output.
- Please note that a failing device-side assertion stops the kernel and puts the CUDA context into an error state, so processing effectively aborts at that point.
- Make sure to include <assert.h> before using the assert function.
- See the video for an example of assertions.
- See the code below to learn how to implement it
- Use memcheck and racecheck for memory status and debugging
- If you are running on a Linux or Mac system, you can use cuda-gdb.
- Nvprof and Visual Profiler are tools used to evaluate the speed of your application, but they can also be helpful with debugging in certain instances.
- Learn how to run nvprof from the command prompt: https://cudaeducation.com/nvprof/
- Watch a very basic walkthrough of NVIDIA Visual Profiler: https://cudaeducation.com/nvidia-visual-profiler-a-very-basic-walkthrough-cuda-programming/
//CUDA EDUCATION
//CUDA Debugging Tutorial | A quick overview of CUDA debugging
//Website: cudaeducation.com
//Twitter: @cudaeducation
//Email: cudaeducation@gmail.com
//YouTube: Cuda Education | Please subscribe
//Slack: https://bit.ly/2NBBG4h | Join the workspace
//Mailing List: Visit cudaeducation.com to join our mailing list
//Donate: Visit cudaeducation.com to donate
//DISCLAIMER: This code is for teaching purposes only! CUDA Education does not guarantee the accuracy of this code in any way. This code should not be used in a production or commercial environment. Any liabilities or loss resulting from the use of this code, in whole or in part, will not be the responsibility of CUDA Education.
//All rights reserved. This code is the property of CUDA Education. Please contact CUDA Education at cudaeducation@gmail.com if you would like to use this code in any way, shape or form.
//CODING ENVIRONMENT:
//CUDA Toolkit 10.0
//Windows environment
//Visual Studio 2017 Community Edition
//nVidia GeForce 1050 ti Graphics Card
//Compute Capability 6.1

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <assert.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector add: c[i] = a[i] + b[i].
// Expected launch: a single block with one thread per element (<<<1, size>>>),
// which is why indexing uses threadIdx.x alone.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];

    // assert() works in device code; a failing assertion is reported for
    // every offending thread and surfaces at the next synchronizing host
    // call. (This particular assertion is trivially true — demo only.)
    assert(threadIdx.x == threadIdx.x);

    // Other debugging options (not used here):
    //   cuda-gdb            - Linux/Mac only
    //   memcheck/racecheck  - memory errors and data races
    //
    // Device printf goes into a fixed-size FIFO ring buffer: if the output
    // grows past the buffer size, older entries are overwritten by newer
    // ones, so be careful how much you print in large launches.
    //
    // BUG FIX: gridDim and blockDim are dim3 structs; passing a struct to a
    // "%d" conversion is undefined behavior. Print the .x components instead.
    printf("thread: %d\n", threadIdx.x);
    printf("block: %d\n", blockIdx.x);
    printf("gridDim: %d\n", gridDim.x);
    printf("blockDim: %d\n", blockDim.x);
    printf("warpSize: %d\n", warpSize);
}

// Host driver: queries the printf FIFO size, runs the demo vector add, and
// prints the result. Returns 0 on success, 1 on any CUDA failure.
int main()
{
    // Find out how large the device printf buffer is. The limit must be
    // queried/changed BEFORE launching any kernels.
    size_t size = 0;
    cudaError_t cudaStatus = cudaDeviceGetLimit(&size, cudaLimitPrintfFifoSize);
    if (cudaStatus != cudaSuccess) {
        // BUG FIX: the original ignored this return value; report it so a
        // bogus "size 0" is not mistaken for a real limit.
        fprintf(stderr, "cudaDeviceGetLimit failed: %s\n", cudaGetErrorString(cudaStatus));
    }
    printf("Printf size found to be %d\n", (int)size);

    // Use this to set the size of the buffer:
    //cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 100 * sizeof(float));

    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
// Allocates device buffers for a, b, c (size ints each), copies the inputs
// up, launches addKernel with one block of `size` threads, and copies the
// result back into c. Returns the first CUDA error encountered, or
// cudaSuccess. All device allocations are released on every exit path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Kernel launches return no error directly; launch-configuration errors
    // surface via cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch (including device asserts).
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // cudaFree(NULL) is a no-op, so this is safe on every exit path.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}