CUDA Tutorial 

Demo 1: Compiling and running vecadd.cu with code walk-through

nvcc vecadd.cu    # login host
./vecadd          # login host and compute host
[arnoldg@ac sdk]$ cat -n  vecadd.cu 
  1     //  Kernel definition, see also section 4.2.3 of Nvidia Cuda Programming Guide 
  2     __global__  void vecAdd(float* A, float* B, float* C) 
  3     { 
  4        // threadIdx.x is a built-in variable  provided by CUDA at runtime 
  5        int i = threadIdx.x; 
  6        A[i]=0; 
  7        B[i]=i; 
  8        C[i] = A[i] + B[i]; 
  9     } 
  10     
  11     #include  <stdio.h> 
  12     #define  SIZE 10 
  13     int  main() 
  14     { 
  15         int N=SIZE; 
  16         float A[SIZE], B[SIZE], C[SIZE]; 
  17         float *devPtrA; 
  18         float *devPtrB; 
  19         float *devPtrC; 
  20         int memsize= SIZE * sizeof(float); 
  21     
  22         cudaMalloc((void**)&devPtrA, memsize); 
  23         cudaMalloc((void**)&devPtrB, memsize); 
  24         cudaMalloc((void**)&devPtrC, memsize); 
  25         cudaMemcpy(devPtrA, A, memsize,  cudaMemcpyHostToDevice); 
  26         cudaMemcpy(devPtrB, B, memsize,  cudaMemcpyHostToDevice); 
  27         // __global__ functions are called:  Func<<< Dg, Db, Ns  >>>(parameter); 
  28         vecAdd<<<1, N>>>(devPtrA,  devPtrB, devPtrC); 
  29         cudaMemcpy(C, devPtrC, memsize,  cudaMemcpyDeviceToHost); 
  30     
  31         for (int i=0; i<SIZE; i++) 
  32          printf("C[%d]=%f\n",i,C[i]); 
  33     
  34          cudaFree(devPtrA); 
  35         cudaFree(devPtrA); 
  36         cudaFree(devPtrA); 
  37     } 
  [arnoldg@ac sdk]$ 

This is the output from the abe cluster:

[arnoldg@abe1391  ~/sdk2.0]$ ./vecadd 
  C[0]=0.000000 
  C[1]=1.000000 
  C[2]=2.000000 
  C[3]=3.000000 
  C[4]=4.000000 
  C[5]=5.000000 
  C[6]=6.000000 
  C[7]=7.000000 
  C[8]=8.000000 
  C[9]=9.000000 
  [arnoldg@abe1391  ~/sdk2.0]$ vi vecadd.cu  # comment out  kernel funtion call 
  [arnoldg@abe1391  ~/sdk2.0]$ nvcc vecadd.cu 
  vecadd.cu(15): warning:  variable "N" was declared but never referenced 
  vecadd.cu(15): warning:  variable "N" was declared but never referenced 

Note the kernel was commented.  It doesn't run in the case below, but the output still appears to be correct.  Why ?

[arnoldg@abe1391  ~/sdk2.0]$ ./a.out 
  C[0]=0.000000 
  C[1]=1.000000 
  C[2]=2.000000 
  C[3]=3.000000 
  C[4]=4.000000 
  C[5]=5.000000 
  C[6]=6.000000 
  C[7]=7.000000 
  C[8]=8.000000 
  C[9]=9.000000 
  [arnoldg@abe1391  ~/sdk2.0]$ 

Running the program on a login node of abe yields:

[arnoldg@honest3  ~/sdk2.0]$ ./vecadd 
  NVIDIA: could not open the  device file /dev/nvidiactl (No such device or address). 
  C[0]=-0.000548 
  C[1]=0.000000 
  C[2]=0.000000 
  C[3]=0.000000 
  C[4]=0.000000 
  C[5]=0.000000 
  C[6]=0.000000 
  C[7]=0.000000 
  C[8]=0.000000 
  C[9]=0.000000 

Question: The program ran, emitted a warning [to stderr, not stdout] and produced incorrect results.  Why ? 

Hints: ls -l /dev/nvidia* , /sbin/lsmod | grep nvidia

Answer: A CUDA program may run without a device and yield unexpected results.  It's important to check for an Nvidia device within the program.  Program defensively.

demo 2: Add code to vecadd.cu to detect an Nvidia device.  Exit if none is found.

[arnoldg@honest3  ~/sdk2.0]$ cat -n vecadd_devdetect.cu 
  1     //  Kernel definition 
  2     __global__  void vecAdd(float* A, float* B, float* C) 
  3     { 
  4        int i = threadIdx.x; 
  5        A[i]=0; 
  6        B[i]=i; 
  7        C[i] = A[i] + B[i]; 
  8     } 
  9     
  10     #include  <stdio.h> 
  11     #define  SIZE 10 
  12     int  main() 
  13     { 
  14         int devcheck(int); 
  15         devcheck(0); 
  16     
  17         int N=SIZE; 
  18         float A[SIZE], B[SIZE], C[SIZE]; 
  19         // Kernel invocation 
  20     
  21         float *devPtrA; 
  22         float *devPtrB; 
  23         float *devPtrC; 
  24         int memsize= SIZE * sizeof(float); 
  25     
  26         cudaMalloc((void**)&devPtrA, memsize); 
  27         cudaMalloc((void**)&devPtrB, memsize); 
  28         cudaMalloc((void**)&devPtrC, memsize); 
  29         cudaMemcpy(devPtrA, A, memsize,  cudaMemcpyHostToDevice); 
  30         cudaMemcpy(devPtrB, B, memsize,  cudaMemcpyHostToDevice); 
  31         vecAdd<<<1, N>>>(devPtrA,  devPtrB, devPtrC); 
  32         cudaMemcpy(C, devPtrC, memsize,  cudaMemcpyDeviceToHost); 
  33     
  34         for (int i=0; i<SIZE; i++) 
  35          printf("C[%d]=%f\n",i,C[i]); 
  36     
  37         cudaFree(devPtrA); 
  38         cudaFree(devPtrA); 
  39         cudaFree(devPtrA); 
  40     } 
  41     
  42     int  devcheck(int gpudevice) 
  43     { 
  44         int device_count=0; 
  45          int device;  // used with  cudaGetDevice() to verify cudaSetDevice() 
  46     
  47         // get the number of non-emulation devices  detected 
  48         cudaGetDeviceCount( &device_count); 
  49         if (gpudevice > device_count) 
  50         { 
  51             printf("gpudevice >=  device_count ... exiting\n"); 
  52             exit(1); 
  53         } 
  54         cudaError_t cudareturn; 
  55         cudaDeviceProp deviceProp; 
  56     
  57         // cudaGetDeviceProperties() is also  demonstrated in the deviceQuery/ example
  58         // of the sdk projects directory 
  59         cudaGetDeviceProperties(&deviceProp,  gpudevice); 
  60          printf("[deviceProp.major.deviceProp.minor] = [%d.%d]\n", 
  61            deviceProp.major, deviceProp.minor); 
  62     
  63         if (deviceProp.major > 999) 
  64         { 
  65             printf("warning, CUDA Device  Emulation (CPU) detected, exiting\n"); 
  66                     exit(1); 
  67         } 
  68     
  69         // choose a cuda device for kernel  execution 
  70         cudareturn=cudaSetDevice(gpudevice); 
  71         if (cudareturn == cudaErrorInvalidDevice) 
  72         { 
  73             perror("cudaSetDevice returned  cudaErrorInvalidDevice"); 
  74         } 
  75         else 
  76         { 
  77             // double check that device was  properly selected 
  78             cudaGetDevice(&device); 
  79              printf("cudaGetDevice()=%d\n",device); 
  80         } 
  81     } 
  [arnoldg@honest3  ~/sdk2.0]$ 

After compiling that version, the program exits when running without a device.

[arnoldg@honest3  ~/sdk2.0]$ ./vecadd_devdetect 
  NVIDIA: could not open the  device file /dev/nvidiactl (No such device or address). 
  [deviceProp.major.deviceProp.minor]  = [9999.9999] 
  warning, CUDA Device  Emulation (CPU) detected, exiting 
  [arnoldg@honest3  ~/sdk2.0]$ 

$ nvcc –help  # There are some rather unique options to the compiler.

[arnoldg@honest3  ~/sdk2.0]$ nvcc --keep --opencc-options -LIST:source=on vecadd.cu
  [arnoldg@honest3  ~/sdk2.0]$ cat vecadd.ptx
  1           .version 1.2
  2           .target sm_10, map_f64_to_f32
  3           // compiled with /usr/local/cuda/open64/lib//be
  4           // nvopencc built on 2008-06-19
  5
  6           .reg .u32 %ra<17>;
  7           .reg .u64 %rda<17>;
  8           .reg .f32 %fa<17>;
  9           .reg .f64 %fda<17>;
  10           .reg .u32 %rv<5>;
  11           .reg .u64 %rdv<5>;
  12           .reg .f32 %fv<5>;
  13           .reg .f64 %fdv<5>;
  14
  15
  16           //-----------------------------------------------------------
  17           // Compiling vecadd.cpp3.i (/tmp/ccBI#.UWx7Mj)
  18           //-----------------------------------------------------------
  19
  20           //-----------------------------------------------------------
  21           // Options:
  22           //-----------------------------------------------------------
  23           //  Target:ptx, ISA:sm_10,  Endian:little, Pointer Size:64
  24           //  -O3 (Optimization level)
  25           //  -g0 (Debug level)
  26           //  -m2 (Report advisories)
  27           //-----------------------------------------------------------
  28
  29           .file   1       "vecadd.cudafe2.gpu"
  30           .file   2       "/usr/lib/gcc/x86_64-redhat-linux/3.4.6/include/stddef.h"
  31           .file   3        "/usr/local/cuda/bin/../include/crt/device_runtime.h"
  32           .file   4        "/usr/local/cuda/bin/../include/crt/../host_defines.h"
  33           .file   5       "/usr/local/cuda/bin/../include/crt/../builtin_types.h"
  34           .file   6        "/usr/local/cuda/bin/../include/crt/../device_types.h"
  35           .file   7        "/usr/local/cuda/bin/../include/crt/../driver_types.h"
  36           .file   8       "/usr/local/cuda/bin/../include/crt/../texture_types.h"
  37           .file   9        "/usr/local/cuda/bin/../include/crt/../vector_types.h"
  38           .file   10       "/usr/local/cuda/bin/../include/crt/../device_launch_parameters.h"
  39           .file   11      "/usr/local/cuda/bin/../include/crt/storage_class.h"
  40           .file   12      "/usr/include/bits/types.h"
  41           .file   13      "/usr/include/time.h"
  42           .file   14      "vecadd.cu"
  43           .file   15      "/usr/local/cuda/bin/../include/common_functions.h"
  44          .file   16       "/usr/local/cuda/bin/../include/crt/func_macro.h"
  45           .file   17       "/usr/local/cuda/bin/../include/math_functions.h"
  46           .file   18      "/usr/local/cuda/bin/../include/device_functions.h"
  47           .file   19       "/usr/local/cuda/bin/../include/math_constants.h"
  48           .file   20       "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
  49           .file   21      "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
  50           .file   22       "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
  51           .file   23       "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
  52           .file   24      "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
  53
  54
  55           .entry __globfunc__Z6vecAddPfS_S_
  56           {
  57           .reg .u32 %r<3>;
  58           .reg .u64 %rd<10>;
  59           .reg .f32 %f<6>;
  60           .param .u64 __cudaparm___globfunc__Z6vecAddPfS_S__A;
  61           .param .u64 __cudaparm___globfunc__Z6vecAddPfS_S__B;
  62           .param .u64 __cudaparm___globfunc__Z6vecAddPfS_S__C;
  63           .loc    14      2        0
  64    //   1  // Kernel definition
  65    //   2  __global__ void vecAdd(float* A, float* B,  float* C)
  66   $LBB1___globfunc__Z6vecAddPfS_S_:
  67           .loc    14      5        0
  68    //   3  {
  69    //   4     int i = threadIdx.x;
  70    //   5     A[i]=0; 
  71          cvt.s32.u16     %r1, %tid.x;            // 
  72           cvt.u64.s32     %rd1, %r1;              // 
  73           mul.lo.u64      %rd2, %rd1,  4;          // 
  74           ld.param.u64    %rd3,  [__cudaparm___globfunc__Z6vecAddPfS_S__A];// id:22  __cudaparm___globfunc__Z6vecAddPfS_S__A+0x0
  75           add.u64         %rd4, %rd3,  %rd2;       // 
  76           mov.f32         %f1,  0f00000000;        // 0
  77           st.global.f32   [%rd4+0],  %f1;  // id:23
  78           .loc    14      6        0
  79    //   6     B[i]=i;
  80           cvt.rn.f32.s32  %f2, %r1;       // 
  81           ld.param.u64    %rd5,  [__cudaparm___globfunc__Z6vecAddPfS_S__B];// id:24  __cudaparm___globfunc__Z6vecAddPfS_S__B+0x0
  82           add.u64         %rd6, %rd5, %rd2;       // 
  83           st.global.f32   [%rd6+0],  %f2;  // id:25
  84           .loc    14      7        0
  85    //   7     C[i] = A[i] + B[i];
  86           ld.global.f32   %f3,  [%rd4+0];  // id:26
  87           add.f32         %f4, %f3, %f2;          // 
  88           ld.param.u64    %rd7,  [__cudaparm___globfunc__Z6vecAddPfS_S__C];// id:27  __cudaparm___globfunc__Z6vecAddPfS_S__C+0x0
  89           add.u64         %rd8, %rd7,  %rd2;       // 
  90           st.global.f32   [%rd8+0],  %f4;  // id:28
  91           .loc    14      8        0
  92    //   8  }
  93           exit;                           // 
  94   $LDWend___globfunc__Z6vecAddPfS_S_:
  95           } // __globfunc__Z6vecAddPfS_S_
  96
  [arnoldg@honest3 ~/sdk2.0]$  ls
  

The collection of intermediate files can be cleared away with the addition of the -clean flag.

See also ptx_isa_1.2.pdf or similar in /usr/local/cuda/doc/ .  Ch. 2, Ch. 4

[arnoldg@honest3  ~/sdk2.0]$ nvcc --ptxas-options=-v vecadd.cu 
  ptxas info    : Compiling entry function  '__globfunc__Z6vecAddPfS_S_' 
  ptxas info    : Used 4 registers, 40+32 bytes smem 
  [arnoldg@honest3  ~/sdk2.0]$ 

CUDA 2.1 FAQ

scroll down to: Programming Questions 4, 29, 30

Question:
Vecadd revealed via gdb:  The sample program can be run in device emulation mode on a system without an Nvidia device and driver loaded for debugging purposes.  The host emulation mode for CUDA tries to mimic the device by using pthreads where the device would run hardware threads.  What might the device and driver need in order to support a rich gdb-like debugging environment? 

Hint: more o/s in the driver for break/interrupts, more silicon on the device and additional instructions in CUDA ptx assembler...

[arnoldg@ac sdk]$ nvcc -g  --device-emulation -o vecadd_dbg vecadd.cu 
  [arnoldg@ac sdk]$  ./vecadd_dbg   # compare with running  this on host w/ device
  NVIDIA: could not open the  device file /dev/nvidiactl (No such file or directory). 
  C[0]=0.000000 
  C[1]=1.000000 
  C[2]=2.000000 
  C[3]=3.000000 
  C[4]=4.000000 
  C[5]=5.000000 
  C[6]=6.000000 
  C[7]=7.000000 
  C[8]=8.000000 
  C[9]=9.000000 
  [arnoldg@ac sdk]$ gdb  ./vecadd_dbg 
  GNU gdb Fedora  (6.8-23.fc9) 
  Copyright (C) 2008 Free  Software Foundation, Inc. 
  License GPLv3+: GNU GPL  version 3 or later <http://gnu.org/licenses/gpl.html> 
  This is free software: you  are free to change and redistribute it. 
  There is NO WARRANTY, to  the extent permitted by law.  Type  "show copying" 
  and "show  warranty" for details. 
  This GDB was configured as  "x86_64-redhat-linux-gnu"... 
  (gdb) l 
  6       B[i]=i; 
  7       C[i] = A[i] + B[i]; 
  8    } 
  9    
  10   
  11   #include <stdio.h> 
  12   #define SIZE 10 
  13   int main() 
  14   { 
  15           int device_count =  0; (gdb) l 16           cudaGetDeviceCount(  &device_count ); 
  17   
  18       int N=SIZE; 
  19       float A[SIZE], B[SIZE],  C[SIZE]; 
  20       // Kernel invocation 
  21   
  22       float *devPtrA; 
  23       float *devPtrB; 
  24       float *devPtrC; 
  25       int memsize= SIZE *  sizeof(float); 
  (gdb) l 
  26   
  27        cudaMalloc((void**)&devPtrA, memsize); 
  28        cudaMalloc((void**)&devPtrB, memsize); 
  29        cudaMalloc((void**)&devPtrC, memsize); 
  30       cudaMemcpy(devPtrA, A,  memsize, cudaMemcpyHostToDevice); 
  31       cudaMemcpy(devPtrB, B,  memsize, cudaMemcpyHostToDevice); 
  32       vecAdd<<<1,  N>>>(devPtrA, devPtrB, devPtrC); 
  33       cudaMemcpy(C, devPtrC,  memsize, cudaMemcpyDeviceToHost); 
  34   
  35       for (int i=0; i<SIZE;  i++) 
  (gdb) l 
  36         printf("C[%d]=%f\n",i,C[i]); 
  37   
  38       cudaFree(devPtrA); 
  39       cudaFree(devPtrA); 
  40       cudaFree(devPtrA); 
  41   } 
  42   
  (gdb) b 32 
  Breakpoint 1 at 0x40b77c:  file vecadd.cu, line 32. 
  (gdb) run Starting program:  /home/ac/arnoldg/sdk/vecadd_dbg 
  [Thread debugging using  libthread_db enabled] 
  [New Thread 0x7f8dde6f7700  (LWP 6750)] 
  NVIDIA: could not open the  device file /dev/nvidiactl (No such file or directory). 
  [New Thread 0x400c7950  (LWP 6753)] 
  [New Thread 0x41e26950  (LWP 6754)] 
  [New Thread 0x40ef2950  (LWP 6755)] 
  [New Thread 0x40d78950  (LWP 6756)] 
Breakpoint 1, main () at  vecadd.cu:32 
  32       vecAdd<<<1,  N>>>(devPtrA, devPtrB, devPtrC); 
  Current language:  auto; currently c++ 
  Missing separate  debuginfos, use: debuginfo-install gcc.x86_64 glibc.x86_64 zlib.x86_64 
  (gdb) s tep
  dim3 (this=0x7fffe67100a0,  x=10, y=1, z=1) 
  at  /usr/local/cuda/bin/../include/vector_types.h:426 
  426      dim3(unsigned int x = 1,  unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {} 
  (gdb) s 
  dim3 (this=0x7fffe67100b0,  x=1, y=1, z=1) 
  at /usr/local/cuda/bin/../include/vector_types.h:426 
  426      dim3(unsigned int x = 1,  unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {} 
  (gdb) s
  __device_stub__Z6vecAddPfS_S_  (__par0=0x7f8dd8000b00, __par1=0x7f8dd8000c00, 
  __par2=0x7f8dd8000d00) 
  at /tmp/tmpxft_00001a1f_00000000-1_vecadd.cudafe1.stub.c:13 
  13   /tmp/tmpxft_00001a1f_00000000-1_vecadd.cudafe1.stub.c: No such  file or directory. 
  in /tmp/tmpxft_00001a1f_00000000-1_vecadd.cudafe1.stub.c 
  (gdb) s
  cudaLaunch<char>  (symbol=0x407ab2 "UH\211�H\201�0\001") 
  at  /usr/local/cuda/bin/../include/cuda_runtime.h:327 
  327    return cudaLaunch((const  char*)symbol); 
  (gdb) step
  [New Thread 0x411d9950  (LWP 6757)] 
  [New Thread 0x41d27950  (LWP 6758)] 
  [New Thread 0x414b8950  (LWP 6759)] 
  [New Thread 0x41f03950  (LWP 6760)] 
  [New Thread 0x403ad950  (LWP 6761)] 
  [New Thread 0x40cbc950  (LWP 6762)] 
  [New Thread 0x41923950  (LWP 6763)] 
  [New Thread 0x40a5a950  (LWP 6764)] 
  [New Thread 0x405ac950  (LWP 6765)] 
  [New Thread 0x41c9a950  (LWP 6766)] 
  [Thread 0x403ad950 (LWP  6761) exited] 
  [Thread 0x40cbc950 (LWP  6762) exited] 
  [Thread 0x405ac950 (LWP  6765) exited] 
  [Thread 0x40a5a950 (LWP  6764) exited] 
  [Thread 0x41923950 (LWP  6763) exited] 
  [Thread 0x414b8950 (LWP  6759) exited] 
  [Thread 0x41d27950 (LWP  6758) exited] 
  [Thread 0x41c9a950 (LWP 6766)  exited] 
  [Thread 0x411d9950 (LWP  6757) exited] 
  [Thread 0x41f03950 (LWP  6760) exited] 
  328  } 
  (gdb) s 
  main () at vecadd.cu:33 
  33       cudaMemcpy(C, devPtrC,  memsize, cudaMemcpyDeviceToHost); 
  (gdb) n 
  35       for (int i=0; i<SIZE;  i++) 
  (gdb) p C 
  $2 = {0, 1, 2, 3, 4, 5, 6,  7, 8, 9} 
  (gdb) c 
  Continuing. 
  C[0]=0.000000 
  C[1]=1.000000 
  C[2]=2.000000 
  C[3]=3.000000 
  C[4]=4.000000 
  C[5]=5.000000 
  C[6]=6.000000 
  C[7]=7.000000 
  C[8]=8.000000 
  C[9]=9.000000 
  [Thread 0x41e26950 (LWP  6754) exited] 
  [Thread 0x40ef2950 (LWP  6755) exited] 
  [Thread 0x400c7950 (LWP  6753) exited] 
  [Thread 0x40d78950 (LWP  6756) exited] 
Program exited normally. 
  (gdb) 
[arnoldg@ac14 sdk]$ 

Question: Where pthreads were used to emulate the device, out on the real device those 10 threads would have run in a thread block. How many device core(s) would have been used to run the thread block?

To debug the actual threads, set a breakpoint in the __global__ kernel. 

Here's an example of that session:

[arnoldg@ac sdk]$ gdb  ./vecadd_dbg 
  GNU gdb Fedora  (6.8-23.fc9) 
  Copyright (C) 2008 Free  Software Foundation, Inc. 
  License GPLv3+: GNU GPL  version 3 or later <http://gnu.org/licenses/gpl.html> 
  This is free software: you  are free to change and redistribute it. 
  There is NO WARRANTY, to  the extent permitted by law.  Type  "show copying" 
  and "show  warranty" for details. 
  This GDB was configured as  "x86_64-redhat-linux-gnu"... 
  (gdb) break 6 
  Breakpoint 1 at 0x407939:  file vecadd.cu, line 6. 
  (gdb) run 
  Starting program:  /home/ac/arnoldg/sdk/vecadd_dbg 
  [Thread debugging using  libthread_db enabled] 
  [New Thread 0x7f3a0a7e3700  (LWP 16825)] 
  NVIDIA: could not open the  device file /dev/nvidiactl (No such file or directory). 
  [New Thread 0x4034f950  (LWP 16828)] 
  [New Thread 0x40bcc950  (LWP 16829)] 
  [New Thread 0x411b7950  (LWP 16830)] 
  [New Thread 0x40d87950  (LWP 16831)] 
  [New Thread 0x41324950  (LWP 16832)] 
  [New Thread 0x40716950  (LWP 16833)] 
  [New Thread 0x4180d950  (LWP 16834)] 
  [New Thread 0x41c7d950  (LWP 16835)] 
  [New Thread 0x40888950  (LWP 16836)] 
  [New Thread 0x403a6950  (LWP 16837)] 
  [New Thread 0x4080b950  (LWP 16838)] 
  [New Thread 0x41280950  (LWP 16839)] 
  [New Thread 0x408c9950  (LWP 16840)] 
  [New Thread 0x41bb9950  (LWP 16841)] 
  [Switching to Thread  0x41324950 (LWP 16832)] 
Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  Current language:  auto; currently c++ 
  Missing separate  debuginfos, use: debuginfo-install gcc.x86_64 glibc.x86_64 zlib.x86_64 
  (gdb) display i 
  1: i = 0 
  (gdb) cont 
  Continuing. 
  [Switching to Thread  0x40716950 (LWP 16833)] 
Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  1: i = 1 
  (gdb) cont 
  Continuing. 
  [Switching to Thread  0x4180d950 (LWP 16834)] 
Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  1: i = 2 
  (gdb) thread 
  [Current thread is 8  (Thread 0x4180d950 (LWP 16834))] 
  (gdb) cont 
  Continuing. 
  [Switching to Thread  0x41c7d950 (LWP 16835)] 
Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  1: i = 3 
  (gdb) thread 
  [Current thread is 9  (Thread 0x41c7d950 (LWP 16835))] 
  (gdb) cont 
  Continuing. 
  [Switching to Thread  0x40888950 (LWP 16836)] 
Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  1: i = 4 
  (gdb) cont 
  Continuing. 
  [Switching to Thread  0x403a6950 (LWP 16837)] 
  
  Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  1: i = 5 
  (gdb) cont 
  Continuing. 
  [Switching to Thread  0x4080b950 (LWP 16838)] 
Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  1: i = 6 
  (gdb) thread 
  [Current thread is 12  (Thread 0x4080b950 (LWP 16838))] 
  (gdb) thread apply all  continue 
Thread 15 (Thread  0x41bb9950 (LWP 16841)): 
  Continuing. 
  [Switching to Thread  0x41280950 (LWP 16839)] 
Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  1: i = 7 
Thread 14 (Thread  0x408c9950 (LWP 16840)): 
  Continuing. 
  [Switching to Thread  0x408c9950 (LWP 16840)] 
Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  1: i = 8 
Thread 13 (Thread  0x41280950 (LWP 16839)): 
  Continuing. 
  [Switching to Thread  0x41bb9950 (LWP 16841)] 
---Type <return> to  continue, or q <return> to quit--- 
  Breakpoint 1, vecAdd  (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) 
  at vecadd.cu:6 
  6       B[i]=i; 
  1: i = 9 
Thread 12 (Thread  0x4080b950 (LWP 16838)): 
Continuing.
[Thread 0x40716950 (LWP 16833) exited] [Thread 0x41324950 (LWP 16832) exited] [Thread 0x408c9950 (LWP 16840) exited] [Thread 0x41c7d950 (LWP 16835) exited] [Thread 0x4080b950 (LWP 16838) exited] [Thread 0x40888950 (LWP 16836) exited] [Thread 0x41280950 (LWP 16839) exited] [Thread 0x403a6950 (LWP 16837) exited] [Thread 0x41bb9950 (LWP 16841) exited] [Thread 0x4180d950 (LWP 16834) exited] C[0]=0.000000 C[1]=1.000000 C[2]=2.000000 C[3]=3.000000 C[4]=4.000000 C[5]=5.000000 C[6]=6.000000 C[7]=7.000000 C[8]=8.000000 C[9]=9.000000 [Thread 0x411b7950 (LWP 16830) exited] [Thread 0x40d87950 (LWP 16831) exited] [Thread 0x4034f950 (LWP 16828) exited] [Thread 0x40bcc950 (LWP 16829) exited]
Program exited normally. 
Segmentation fault  # varies by gdb session, Nvidia driver version [arnoldg@ac sdk]$

Question: Debugging with device emulation mode is the only way to see what happens to threads in a kernel.  Remember there's no printf() out on the device.  Why?

BREAK, 30 minutes

Review of 1st  hour, questions ?

Nvidia SDK

Reduction SDK example: test cases

$ cd projects/reduction
  $  ../../bin/linux/release/deviceQuery
  …
  Device 3: "Quadro FX  5600" 
  Major revision number:                         1 
  Minor revision number:                         0 
  Total amount of global memory:                 1610350592 bytes 
  Number of multiprocessors:                     16 
  Number of cores:                               128 
  Total amount of constant memory:               65536 bytes 
  Total amount of shared memory per block:       16384 bytes 
  Total number of registers available per  block: 8192 
  Warp size:                                     32 
  Maximum number of threads per block:           512 
  Maximum sizes of each dimension of a  block:    512 x 512 x 64 
  Maximum sizes of each dimension of a  grid:     65535 x 65535 x 1 
  Maximum memory pitch:                          262144 bytes 
  Texture alignment:                             256 bytes 
  Clock rate:                                    1.35 GHz 
  Concurrent copy and execution:                 No 
  ...
[arnoldg@abe1391  reduction]$ ../../bin/linux/release/reduction --kernel=5 --n=16384 
  Reducing array of type  int. 
  Using Device 0:  "Tesla C1060" 
  16384 elements 
  128 threads (max) 
  64 blocks 
  Average time: 0.025320 ms 
  Bandwidth:    2.588309 GB/s 
GPU result = 2089065 
  CPU result = 2089065 
  TEST PASSED 
  [arnoldg@abe1391  reduction]$ ../../bin/linux/release/reduction --kernel=5 --threads=1024  --n=16384 
  Reducing array of type  int. 
  Using Device 0:  "Tesla C1060" 
  16384 elements 
  1024 threads (max) 
  8 blocks 
  Error: too many threads or  blocks, exiting... 
  [arnoldg@abe1391  reduction]$ 
  [arnoldg@abe1391  reduction]$ vi reduction_kernel.cu  #  comment warning, exit line 329...
  [arnoldg@qp16 reduction]$  diff reduction_kernel.cu   reduction_kernel.cu.ok
  329,337d328
  <     struct cudaDeviceProp mycudaDeviceProperties;
  <      cudaGetDeviceProperties(&mycudaDeviceProperties , 0);
  <     if( (threads >  mycudaDeviceProperties.maxThreadsPerBlock) ||
  <         (blocks >  mycudaDeviceProperties.maxGridSize[0]) )
  <     {
  <         printf("Error: too many threads  or blocks, exiting...\n");
  <         exit(1);
  <     }
  < 
  [arnoldg@abe1391  reduction]$ make clean 
  [arnoldg@abe1391  reduction]$ make 
  [arnoldg@abe1391  reduction]$ ../../bin/linux/release/reduction --kernel=5 --threads=1024 --n=16384 
  Reducing array of type  int. 
  Using Device 0:  "Tesla C1060" 
  16384 elements 
  1024 threads (max) 
  8 blocks 
  Average time: 0.019560 ms 
  Bandwidth:    3.350509 GB/s 
GPU result = 106503 
  CPU result = 2089065 
  TEST FAILED 

A CUDA program may run incorrectly when device limits are exceeded.  There is no capability for segmentation fault or floating point exception traps.  It's the programmer's responsibility to ensure that kernels are launched within device constraints.  The driver is getting better at detecting some of these conditions, but it's not completely reliable at this time.

gprof notes:
  vi  ../../common/common.mk     # add -pg to  the Compilers section ,  lines 65-67  including nvcc
  vi reduction.cu  # change 100 iterations to 1 iteration
  make clean; make
  cp  ../../bin/linux/release/reduction  .
  ./reduction --kernel=5 --n=4194304
  gprof reduction
  ./reduction --kernel=6  --n=16777216
  gprof reduction
  ./reduction --kernel=3  --n=64
  gprof reduction
  # note that the host cpu  is competitive when data fit within cache

Question: When would it make sense to run a kernel with 128 total threads on the device?

Porting compute  PI into the CUDA reduction example.  This is a classic sample compute pi MPI application:

[arnoldg@ac14  reductionPI]$ cat /home/ac/arnoldg/cuda101/cpi.c 
  #include "mpi.h" 
  #include <stdio.h> 
  #include <math.h> 
  int main( int argc, char  *argv[] ) 
  { 
  int n, myid, numprocs, i; 
  double PI25DT = 3.141592653589793238462643; 
  double mypi, pi, h, sum, x; 
  MPI_Init(&argc,&argv); 
  MPI_Comm_size(MPI_COMM_WORLD,&numprocs); 
  MPI_Comm_rank(MPI_COMM_WORLD,&myid); 
  while (1) { 
  if (myid == 0) { 
  printf("Enter the number of  intervals: (0 quits) "); 
  scanf("%d",&n); 
  } 
  // send 1 MPI_INT “n” from rank 0 to  MPI_COMM_WORLD
  MPI_Bcast(&n, 1, MPI_INT, 0,  MPI_COMM_WORLD); 
  if (n == 0) 
  break; 
  else { 
  h    = 1.0 / (double) n; 
  sum = 0.0; 
  // set loop stride to numprocs ,  each process
  // will do 1/numprocs of the work  calculating
  // partial sums
  for (i = myid + 1; i <= n; i +=  numprocs) { 
  x = h * ((double)i - 0.5); 
  sum += (4.0 / (1.0 + x*x)); 
  } 
  mypi = h * sum;
  // collect 1 MPI_DOUBLE mypi from  each process and 
  // accumulate the result in pi 
  MPI_Reduce(&mypi,  &pi, 1, MPI_DOUBLE, MPI_SUM, 0, 
  MPI_COMM_WORLD); 
  if (myid == 0)  
  printf("pi is  approximately %.16f, Error is %.16f\n", 
  pi, fabs(pi - PI25DT)); 
  } 
  } 
  MPI_Finalize(); 
  return 0; 
  } 
  [arnoldg@ac14  reductionPI]$ 

Question: Can we port the for loop and MPI_Reduce() from cpi.c to reduction.cu and reduction_kernel.cu ?

Imagine the for loop replaced by a fixed number of threads [16384 works well].  n from cpi.c corresponds to size in reduction.cu and each thread will need that value [h_idata near the end of reduction.cu].   Assigning size to each value of the input array [h_idata] duplicates the work of MPI_Bcast().

The work of the cpi.c for loop would end  up in one of the kernel cases of  reduction_kernel.cu .  A sample program invocation might  resemble:

../../bin/linux/release/reductionPI  –kernel=1 –type=float -n=16384

This was accomplished with just a few lines of code added to the reduction project.  It looks pretty simple, however debugging it was a time consuming process.

[arnoldg@qp reduction]$  diff reduction.cu reduction.cu.ok
  456,459d455
  <             if ((whichKernel==1 ) &&  (datatype == REDUCE_FLOAT))
  <           {
  <               h_idata[i]= (float) size;
  <           }
  522d517
  <             printf("GPU result*h =  %0.12f\n", gpu_result * (1.0/size));
  [arnoldg@qp reduction]$ 
  # insert before:  sdata[tid] = g_idata[i];
  
[arnoldg@qp reduction]$  diff reduction_kernel.cu reduction_kernel.cu.ok
  105,113d104
  < // These lines would  have been done in a for loop with numprocs stride for
  < // each mpi rank, on  the gpu each thread will do a single loop iteration
  < if ( g_idata[i]  ==16384.0 )  // 1st trip only,  all others reduce
  < {
  <     float h= 1.0 / ( g_idata[i]);
  <     float x = h*( (float) i - 0.5);
  <     sdata[tid] = (4.0 / ( 1.0 + x*x));
  < }
  < else
make clean; make
[arnoldg@abe1202  reductionPI]$ ../../bin/linux/release/reductionPI --kernel=1 --type=float  -n=16384
  Reducing array of type  float.
  Using Device 0:  "Tesla C1060"
  16384 elements
  128 threads (max)
  128 blocks
  Average time: 0.036000 ms
  Bandwidth:    1.820445 GB/s

GPU result = 3.141714811325

Combining MPI and CUDA

The simplest way forward is to use nvcc for everything.  The nvcc compiler wrapper is somewhat more complex than the typical mpicc compiler wrapper, so it's easier to make MPI code into .cu and compile with nvcc than the other way around.  A sample makefile might resemble:

[arnoldg@ac14 mpi-gpu]$  cat Makefile 
  MPICC      := nvcc -Xptxas -v 
  MPI_INCLUDES    := /usr/mpi/intel/mvapich2-1.2p1/include 
  MPI_LIBS   := /usr/mpi/intel/mvapich2-1.2p1/lib 
%.o : %.cu 
  $(MPICC) -I$(MPI_INCLUDES) -o $@ -c $< 
mpi_hello_gpu : vecadd.o  mpi_hello_gpu.o 
  $(MPICC) -L$(MPI_LIBS) -lmpich -o $@ *.o 
clean : 
  rm vecadd.o mpi_hello_gpu.o 
all : mpi_hello_gpu 

Source code  files follow.

[arnoldg@ac14 mpi-gpu]$  cat mpi_hello_gpu.cu 
  #include <mpi.h> 
  #include <stdio.h> 
  #include <stdlib.h> 
  #define PPN 4 
  #define INTARRAYLEN 65535 
  #define BCASTREPS 1000 
int main(int argc, char  *argv[]) 
  { 
  int bcastme[INTARRAYLEN], ranksum; 
  int rank, size, len; 
  int gpudevice; 
  int vecadd(int, int); 
  char name[MPI_MAX_PROCESSOR_NAME]; 
        MPI_Init(&argc, &argv); 
  MPI_Comm_rank(MPI_COMM_WORLD,  &rank); 
  MPI_Comm_size(MPI_COMM_WORLD,  &size); 
  MPI_Get_processor_name(name, &len); 
     // do some MPI work, showing MPI and CUDA being run from one  routine 
  if (rank == 0) { bcastme[3]=3; } 
  for (int i=0; i<BCASTREPS; i++) 
  { 
  MPI_Bcast(bcastme, INTARRAYLEN, MPI_INT, 0,  MPI_COMM_WORLD); 
  } 
     // modulo is useful in determining unique gpu device ids if  ranks 
  // are packed into nodes and not assigned in round robin  fashion 
  
  gpudevice= rank % PPN; 
  printf("rank %d of %d on %s received bcastme[3]=%d [gpu  %d]\n", rank, size, name,bcastme[3], gpudevice); 
     
  vecadd(gpudevice, rank); 
     // more MPI work showing MPI is functional after CUDA 
  MPI_Reduce(&rank, &ranksum, 1,  MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); 
  if (rank == 0) { printf("ranksum= %d\n", ranksum); } 
     MPI_Finalize(); 
  } 
  [arnoldg@ac14 mpi-gpu]$

Parameters for passing the MPI rank and selecting a gpu were added to vecadd.

  [arnoldg@ac14 mpi-gpu]$  cat vecadd.cu 
  // Kernel definition 
  __global__ void  vecAdd(float* A, float* B, float* C) 
  { 
  int i = threadIdx.x; 
  A[i]=0; 
  B[i]=i; 
  C[i] = A[i] + B[i]; 
  } 
#include <stdio.h> 
  #define SIZE 10 
  #define KERNELINVOKES  5000000 
  int vecadd(int  gpudevice, int rank) 
  { 
  int devcheck(int, int); 
  devcheck(gpudevice, rank); 
    float A[SIZE], B[SIZE], C[SIZE]; 
  // Kernel invocation 
    float *devPtrA; 
  float *devPtrB; 
  float *devPtrC; 
  int memsize= SIZE * sizeof(float); 
    cudaMalloc((void**)&devPtrA, memsize); 
  cudaMalloc((void**)&devPtrB, memsize); 
  cudaMalloc((void**)&devPtrC, memsize); 
  cudaMemcpy(devPtrA, A, memsize,  cudaMemcpyHostToDevice); 
  cudaMemcpy(devPtrB, B, memsize,  cudaMemcpyHostToDevice); 
  for (int i=0; i<KERNELINVOKES; i++) 
  { 
  vecAdd<<<1,  gpudevice>>>(devPtrA, devPtrB, devPtrC); 
  } 
  cudaMemcpy(C, devPtrC, memsize,  cudaMemcpyDeviceToHost); 
  // calculate only up to  gpudevice to show the unique output 
  // of each rank's kernel  launch 
  for (int i=0; i<gpudevice; i++) 
  printf("rank %d: C[%d]=%f\n",rank,i,C[i]); 
  cudaFree(devPtrA); 
  cudaFree(devPtrA); 
  cudaFree(devPtrA); 
  } 
int devcheck(int  gpudevice, int rank) 
  { 
  int device_count=0; 
  int device;   // used with cudaGetDevice() to verify cudaSetDevice() 
  cudaGetDeviceCount( &device_count); 
  if (gpudevice >= device_count) 
  { 
  printf("gpudevice >=  device_count ... exiting\n"); 
  exit(1); 
  } 
  cudaError_t cudareturn; 
  cudaDeviceProp deviceProp; 
  cudaGetDeviceProperties(&deviceProp,  gpudevice); 
  if (deviceProp.warpSize <= 1) 
  { 
  printf("rank %d: warning, CUDA Device Emulation (CPU)  detected, exiting\n", rank); 
  exit(1); 
  } 
  cudareturn=cudaSetDevice(gpudevice); 
  if (cudareturn == cudaErrorInvalidDevice) 
  { 
  perror("cudaSetDevice returned  cudaErrorInvalidDevice"); 
  } 
  else 
  { 
  cudaGetDevice(&device); 
  printf("rank %d:  cudaGetDevice()=%d\n",rank,device); 
  } 
} 
  [arnoldg@ac14 mpi-gpu]$ 

Tip:

Some MPI implementations will use locked memory along with CUDA.  There's no good convention currently in place to deal with potential resource contention for locked memory between MPI and CUDA.   It may make sense to avoid cudaMallocHost() and cudaMemcpy*Async() in cases where MPI also needs locked memory for buffers.  Mvapich [for Infiniband clusters] requires some locked memory.

Hardware tip:

For systems with multiple PCI buses [lspci -tv] or multiple NUMA nodes [numactl –hardware] you can use the taskset command and build a memory bandwidth performance table with a few command iterations of bandwidthTest.  Test the cpus with taskset -c and the gpus with –device=N.

  taskset -c 3  ../../bin/linux/release/bandwidthTest --device=0 --htod
  
  MB/s performance table for ac13
  
  nvidia0 nvidia1 nvidia2 nvidia3
  cpu0  1247    1263    1511    1504
  cpu1  1576    1588    1509    1499
  cpu2  1259    1257    1507    1501
  cpu3  1591    1574    1509    1487
  
  That table suggests that cpus 0,2 get along best with the nvidia2,3 devices.
  That seems to agree with the value of local_cpulist from the 2nd and 3rd  entries listed from "lspci -tv":
  
  cd /sys/devices/ ; find .  -name local_cpulist -exec cat {} \;   #  or local_cpus 
  ...
  pci0000:00/0000:00:0f.0/0000:18:00.0/0000:19:01.0/0000:1b:00.0/0000:1c:00.0/0000:1d:00.0/local_cpulist
  0,2
  pci0000:00/0000:00:0f.0/0000:18:00.0/0000:19:01.0/0000:1b:00.0/0000:1c:01.0/0000:1e:00.0/local_cpulist
  0,2

The ordering as shown in lspci matches up to /dev/nvidiaN as a simple 0,1,2,3 numbering.
Newer Nvidia driver and hardware also map gpu numbers to pci locations:

[arnoldg@abe1391  reduction]$ cat /proc/driver/nvidia/cards/1 
  Model:           Tesla C1060 
  IRQ:             16 
  Video BIOS:      ??.??.??.??.?? 
  Card Type:       PCI-E 
  DMA Size:   40 bits 
  DMA Mask:   0xffffffffff 
  Bus Location:    14.00.0
  
  [arnoldg@abe1391 ~]$  /sbin/lspci | grep "3D controller" 
  12:00.0 3D controller:  nVidia Corporation: Unknown device 05e7 (rev a1) 
  14:00.0 3D controller:  nVidia Corporation: Unknown device 05e7 (rev a1)

Summary points and questions.

Cuda programming requires a hands-on approach.  Check error and return codes, check for presence of a device, and check results for correctness. 

What's the cost of a branch [if (rank/process/thread == 3) { take alt. Path } ]to relative performance of the following parallel programming models: MPI, OpenMP, CUDA

No fortran was harmed in this tutorial.

Until now...

http://www.pgroup.com/lit/pgi_whitepaper_accpre.pdf

http://www.clustermonkey.net//content/view/248/33/1/1/

Questions:

Would an MPI or OpenMP programmer be  comfortable porting to the PGI compilers with accelerator directives?

Which programmer would have the easier path to PGI acceleration ? 

Which programmer might find CUDA more appealing?