A brief outline of how to go about debugging your CUDA program.
- Probably the best way in my opinion is to use printf functionality to print out variable and system state to the console.
- printf can be used in host code and device code; device-side printf requires a compute capability of 2.0 or higher.
- Be careful not to print so much that the device printf buffer wraps around: old entries get overwritten by newer ones, and you end up reading misleading output.
- The code below prints out the size of the printf buffer, so you can have an idea of what you are up against.
- You can also use the assert function to test if specific values are true in your code.
- The assert function is also able to evaluate built-in CUDA variables in host and device code, so that is helpful when tackling very detailed processes.
- In my experience, it will throw an assertion for each and every thread that it comes across and display it in the output.
- Please note that a failing device-side assertion stops the kernel and puts the CUDA context into an error state, so processing effectively aborts at that point.
- Make sure to include <assert.h> before using the assert function.
- See the video for an example of assertions.
- See the code below to learn how to implement it
- Use memcheck and racecheck for memory status and debugging
- If you are running on a Linux or Mac system, you can use cuda-gdb.
- Nvprof and Visual Profiler are tools used to evaluate the speed of your application, but they can also be helpful with debugging in certain instances.
- Learn how to run nvprof from the command prompt: https://cudaeducation.com/nvprof/
- Watch a very basic walkthrough of NVIDIA Visual Profiler: https://cudaeducation.com/nvidia-visual-profiler-a-very-basic-walkthrough-cuda-programming/
//CUDA EDUCATION
//CUDA Debugging Tutorial | A quick overview of CUDA debugging
//Website: cudaeducation.com
//Twitter: @cudaeducation
//Email: cudaeducation@gmail.com
//YouTube: Cuda Education | Please subscribe
//Slack: https://bit.ly/2NBBG4h | Join the workspace
//Mailing List: Visit cudaeducation.com to join our mailing list
//Donate: Visit cudaeducation.com to donate
//DISCLAIMER: This code is for teaching purposes only! CUDA Education does not guarantee the accuracy of this code in any way. This code should not be used in a production or commercial environment. Any liabilities or loss resulting from the use of this code, in whole or in part, will not be the responsibility of CUDA Education.
//All rights reserved. This code is the property of CUDA Education. Please contact CUDA Education at cudaeducation@gmail.com if you would like to use this code in any way, shape or form.
//CODING ENVIRONMENT:
//CUDA Toolkit 10.0
//Windows environment
//Visual Studio 2017 Community Edition
//nVidia GeForce 1050 ti Graphics Card
//Compute Capability 6.1

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <assert.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector add: c[i] = a[i] + b[i].
// Expected launch: a single block with one thread per element (<<<1, size>>>),
// which is why indexing uses threadIdx.x alone.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];

    // assert() works in device code; a failing assertion is reported for
    // every offending thread and surfaces at the next synchronizing host
    // call. (This particular assertion is trivially true — demo only.)
    assert(threadIdx.x == threadIdx.x);

    // Other debugging options (not used here):
    //   cuda-gdb            - Linux/Mac only
    //   memcheck/racecheck  - memory errors and data races
    //
    // Device printf goes into a fixed-size FIFO ring buffer: if the output
    // grows past the buffer size, older entries are overwritten by newer
    // ones, so be careful how much you print in large launches.
    //
    // BUG FIX: gridDim and blockDim are dim3 structs; passing a struct to a
    // "%d" conversion is undefined behavior. Print the .x components instead.
    printf("thread: %d\n", threadIdx.x);
    printf("block: %d\n", blockIdx.x);
    printf("gridDim: %d\n", gridDim.x);
    printf("blockDim: %d\n", blockDim.x);
    printf("warpSize: %d\n", warpSize);
}

// Host driver: queries the printf FIFO size, runs the demo vector add, and
// prints the result. Returns 0 on success, 1 on any CUDA failure.
int main()
{
    // Find out how large the device printf buffer is. The limit must be
    // queried/changed BEFORE launching any kernels.
    size_t size = 0;
    cudaError_t cudaStatus = cudaDeviceGetLimit(&size, cudaLimitPrintfFifoSize);
    if (cudaStatus != cudaSuccess) {
        // BUG FIX: the original ignored this return value; report it so a
        // bogus "size 0" is not mistaken for a real limit.
        fprintf(stderr, "cudaDeviceGetLimit failed: %s\n", cudaGetErrorString(cudaStatus));
    }
    printf("Printf size found to be %d\n", (int)size);

    // Use this to set the size of the buffer:
    //cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 100 * sizeof(float));

    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
// Allocates device buffers for a, b, c (size ints each), copies the inputs
// up, launches addKernel with one block of `size` threads, and copies the
// result back into c. Returns the first CUDA error encountered, or
// cudaSuccess. All device allocations are released on every exit path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Kernel launches return no error directly; launch-configuration errors
    // surface via cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch (including device asserts).
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // cudaFree(NULL) is a no-op, so this is safe on every exit path.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}