//CUDA Streams | A very simple demonstration of cuda streams
//Website: cudaeducation.com
//Company: CUDA Education | cudaeducation.com | cudaeducation@gmail.com | Please donate at cudaeducation.com
//YouTube Channel (please subscribe): Cuda Education | https://www.youtube.com/channel/UCzpwNg0Ai8zCzbsEtozkfFQ
//Twitter: @cudaeducation
//Slack: https://bit.ly/2NBBG4h | Join the workspace
//Have questions? Comment on the YouTube channel https://www.youtube.com/channel/UCzpwNg0Ai8zCzbsEtozkfFQ or message on Twitter
//Donate: Visit cudaeducation.com to donate
//DISCLAIMER: This code is for teaching purposes only! CUDA Education does not guarantee the accuracy of this code in any way. This code should not be used in a production or commercial environment. Any liabilities or loss resulting from the use of this code, in whole or in part, will not be the responsibility of CUDA Education.
//All rights reserved. This code is the property of CUDA Education. Please contact CUDA Education at cudaeducation@gmail.com if you would like to use this code in any way, shape or form.
//CODING ENVIRONMENT:
//CUDA Toolkit 9.0
//Windows environment
//Visual Studio 2017 Community Edition
//nVidia GeForce 1050 ti Graphics Card
//Compute Capability 6.1
//make sure to include all relevant libraries
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
//function that will run on the GPU
//function that will run on the GPU
//Kernel that performs dummy busy-work so each stream has something measurable to execute.
//Launch expectations: any 1-D grid/block shape works (the kernel reads no thread indices
//and touches no global memory); no dynamic shared memory is required.
__global__ void cuda_education_function_on_CUDA_GPU()
{
//some pointless processing just to make sure the function is actually called.
//NOTE: gpudummy must be volatile -- without it the compiler sees a loop with no
//observable effect and is free to delete it entirely, leaving an empty kernel and
//making the stream-concurrency timing demo meaningless.
volatile int gpudummy = 0;
for (int i = 0; i < 100000; i++)
{
gpudummy = gpudummy + 1;
gpudummy = gpudummy - 1;
}
//CUDA EDUCATION
//Website: cudaeducation.com
//Twitter: @cudaeducation
//Email: cudaeducation@gmail.com
//YouTube: Cuda Education | Please subscribe
//Slack: https://bit.ly/2NBBG4h | Join the workspace
//Donate: Visit cudaeducation.com to donate
}
//number of concurrent streams used to demonstrate kernel concurrency
#define CUDA_EDU_NUM_STREAMS 22

//Helper: abort with a readable message if a CUDA runtime call failed.
//Every CUDA API call returns cudaError_t; ignoring it hides failures silently.
static void checkCuda(cudaError_t err, const char *what)
{
if (err != cudaSuccess)
{
fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

//Entry point: creates CUDA_EDU_NUM_STREAMS streams, launches one small kernel on each
//so the launches can overlap (kernel concurrency), times the whole batch with CUDA
//events, prints the elapsed time, then cleans everything up.
int main(int argc, char **argv)
{
//get the nVidia GPU running CUDA ready
int cuda_education_device = 0;
//set the device to be used for CUDA execution
checkCuda(cudaSetDevice(cuda_education_device), "cudaSetDevice");
//set up the CUDA timer for tracking how long it takes the kernel on the device (GPU) to do its job
//for more information on this, check out https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/
cudaEvent_t start, stop;
checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
//START create variables for the CUDA kernel launch
//THREADS_IN_BLOCK: number of threads in each block (dim3; unused dims default to 1)
dim3 THREADS_IN_BLOCK(64, 1);
//BLOCKS_IN_GRID: number of blocks in the grid (dim3; unused dims default to 1)
dim3 BLOCKS_IN_GRID(1, 1);
//print out the values used for block and grid
printf("number of blocks %d | number of threads in each block %d\n", BLOCKS_IN_GRID.x, THREADS_IN_BLOCK.x);
//END create variables for the CUDA kernel launch
//START create streams for cuda kernel launches
//an array + loop replaces 22 hand-written stream variables/creates: same behavior, far less copy-paste
cudaStream_t streams[CUDA_EDU_NUM_STREAMS];
for (int s = 0; s < CUDA_EDU_NUM_STREAMS; s++)
{
checkCuda(cudaStreamCreate(&streams[s]), "cudaStreamCreate");
}
//END create streams for cuda kernel launches
//dynamic shared-memory bytes per block
//not relevant in this example, but have to fill in a value when launching kernel with streams
int my_shared_memory = 0;
//make sure all preceding tasks on the device (GPU) have been completed before proceeding with new business
checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
//"start" event marker. this is simply a marker in the code
checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
//launch the CUDA kernel once per stream
//several kernel launches on multiple streams at the same time. this is where kernel concurrency happens
for (int s = 0; s < CUDA_EDU_NUM_STREAMS; s++)
{
cuda_education_function_on_CUDA_GPU << <BLOCKS_IN_GRID, THREADS_IN_BLOCK, my_shared_memory, streams[s] >> > ();
}
//kernel launches do not return errors directly; check for launch-configuration errors explicitly
checkCuda(cudaGetLastError(), "kernel launch");
//to see the sequential (no-concurrency) alternative, launch the same kernels on the
//null stream instead: cuda_education_function_on_CUDA_GPU << <BLOCKS_IN_GRID, THREADS_IN_BLOCK >> > ();
//"stop" event marker. this is simply a marker in the code. it does not guarantee that everything preceding the marker has finished processing.
checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
//the CPU launches kernels asynchronously and immediately continues; cudaEventSynchronize(stop)
//halts the CPU until all work recorded before the stop event has actually finished on the device
checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
//now that all work before the "stop" marker is finished, calculate and print the elapsed time
float milliseconds = 0;
checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
printf("DEVICE elapsed time: %f milliseconds\n", milliseconds);
//practice good housekeeping: destroy the streams (missing in the original), destroy the
//event variables, and reset the device when you are done
for (int s = 0; s < CUDA_EDU_NUM_STREAMS; s++)
{
checkCuda(cudaStreamDestroy(streams[s]), "cudaStreamDestroy");
}
checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
checkCuda(cudaDeviceReset(), "cudaDeviceReset");
//CUDA EDUCATION
//Website: cudaeducation.com
//Twitter: @cudaeducation
//Email: cudaeducation@gmail.com
//YouTube: Cuda Education
//Slack: https://bit.ly/2NBBG4h | Join the workspace
//Donate: Visit cudaeducation.com to donate to the cause
return 0;
}