A very simple example of transferring data from the device (GPU) to the host (CPU) using CUDA. If you have any questions, contact me on twitter @cudaeducation or comment on the YouTube video.
Download code: CUDA Transfer Data from Device to Host
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
//CUDA Programming Example | Simple Transfer Data from GPU to CPU
//Company: CUDA Education | cudaeducation.com | cudaeducation@gmail.com | Please donate at cudaeducation.com
//YouTube Channel (please subscribe): https://www.youtube.com/channel/UCzpwNg0Ai8zCzbsEtozkfFQ
//Twitter: @cudaeducation
//DISCLAIMER: This code is for teaching purposes only! CUDA Education does not guarantee the accuracy of this code in any way. This code should not be used in a production or commercial environment.
//All rights reserved. This code is the property of CUDA Education. Contact cudaeducation@gmail.com to use this code in any way, shape or form.

//CODING ENVIRONMENT:
//CUDA Toolkit 9.0
//Windows environment
//Visual Studio 2017 Community Edition
//nVidia GeForce 1050 ti Graphics Card

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

//Every CUDA runtime call returns a cudaError_t; kernel launches report
//errors only through cudaGetLastError()/the next synchronizing call.
//This macro checks a call and aborts with file/line and a readable
//message on failure, so errors never pass silently.
#define CUDA_CHECK(call)                                                     \
	do {                                                                     \
		cudaError_t err_ = (call);                                           \
		if (err_ != cudaSuccess) {                                           \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
			        cudaGetErrorString(err_));                               \
			exit(EXIT_FAILURE);                                              \
		}                                                                    \
	} while (0)

//Kernel: each thread writes its own thread index into the output array.
//Expected launch layout: a single block of N threads, where the output
//array device_holder holds at least N ints — threadIdx.x alone is then a
//valid, in-bounds index. (Name kept for compatibility with the original
//tutorial; note the kernel does not actually use __syncthreads.)
__global__ void cuda_education_syncthreads_kernel(int *device_holder)
{
	device_holder[threadIdx.x] = threadIdx.x;
}

int main()
{
	//set the device to be used for CUDA execution
	int cuda_education_device = 0;
	CUDA_CHECK(cudaSetDevice(cuda_education_device));

	//declare the number of threads to use
	//on a GeForce 1050 ti the maximum threads per block is 1024;
	//the limit depends on your GPU, so 256 is used here to be safe
	//and accommodate most people's GPUs
	int number_of_threads = 256;

	//device-side buffer that the kernel fills and we copy back to the host
	int *device_variable_to_send_data_back_to_host = NULL;

	//bytes needed: one int per thread
	size_t memory_space = number_of_threads * sizeof(int);

	//allocate the device buffer (checked: cudaMalloc can fail, e.g. OOM)
	CUDA_CHECK(cudaMalloc((void **)&device_variable_to_send_data_back_to_host, memory_space));

	//launch configuration: one block of number_of_threads threads,
	//matching the single-block layout the kernel expects
	dim3 threadblocks(number_of_threads);
	dim3 grid(1);

	//launch the kernel
	cuda_education_syncthreads_kernel << <grid, threadblocks >> > (device_variable_to_send_data_back_to_host);

	//a launch returns no status directly: cudaGetLastError catches bad
	//launch configurations, and the synchronize both waits for the kernel
	//to finish (so the copy below sees its results) and surfaces any
	//asynchronous execution error
	CUDA_CHECK(cudaGetLastError());
	CUDA_CHECK(cudaDeviceSynchronize());

	//host-side buffer to receive the data from the device
	int *host_variable_receive_data_from_device = (int *)malloc(memory_space);
	if (host_variable_receive_data_from_device == NULL) {
		fprintf(stderr, "host malloc of %zu bytes failed\n", memory_space);
		exit(EXIT_FAILURE);
	}

	//copy the data from device to host (checked: catches bad pointers/sizes)
	CUDA_CHECK(cudaMemcpy(host_variable_receive_data_from_device,
	                      device_variable_to_send_data_back_to_host,
	                      memory_space, cudaMemcpyDeviceToHost));

	//print the results: one line per thread index, 0..number_of_threads-1
	for (int i = 0; i < number_of_threads; i++)
	{
		printf("%d \n", host_variable_receive_data_from_device[i]);
	}

	//practice good housekeeping: release both buffers explicitly
	//(cudaDeviceReset would reclaim device memory, but not the host malloc)
	CUDA_CHECK(cudaFree(device_variable_to_send_data_back_to_host));
	free(host_variable_receive_data_from_device);

	//reset the device when done
	CUDA_CHECK(cudaDeviceReset());

	return 0;
}
Next: Using CUDA Pinned Memory for faster transfers https://cudaeducation.com/cudapinnedmemory/
Leave a Reply