nvcc vecadd.cu # login host ./vecadd # login host and compute host
[arnoldg@ac sdk]$ cat -n vecadd.cu
1 // Kernel definition, see also section 4.2.3 of Nvidia Cuda Programming Guide
2 __global__ void vecAdd(float* A, float* B, float* C)
3 {
4 // threadIdx.x is a built-in variable provided by CUDA at runtime
5 int i = threadIdx.x;
6 A[i]=0;
7 B[i]=i;
8 C[i] = A[i] + B[i];
9 }
10
11 #include <stdio.h>
12 #define SIZE 10
13 int main()
14 {
15 int N=SIZE;
16 float A[SIZE], B[SIZE], C[SIZE];
17 float *devPtrA;
18 float *devPtrB;
19 float *devPtrC;
20 int memsize= SIZE * sizeof(float);
21
22 cudaMalloc((void**)&devPtrA, memsize);
23 cudaMalloc((void**)&devPtrB, memsize);
24 cudaMalloc((void**)&devPtrC, memsize);
25 cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
26 cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
27 // __global__ functions are called: Func<<< Dg, Db, Ns >>>(parameter);
28 vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
29 cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);
30
31 for (int i=0; i<SIZE; i++)
32 printf("C[%d]=%f\n",i,C[i]);
33
34 cudaFree(devPtrA);
35 cudaFree(devPtrA);
36 cudaFree(devPtrA);
37 }
[arnoldg@ac sdk]$
This is the output from the abe cluster:
[arnoldg@abe1391 ~/sdk2.0]$ ./vecadd C[0]=0.000000 C[1]=1.000000 C[2]=2.000000 C[3]=3.000000 C[4]=4.000000 C[5]=5.000000 C[6]=6.000000 C[7]=7.000000 C[8]=8.000000 C[9]=9.000000 [arnoldg@abe1391 ~/sdk2.0]$ vi vecadd.cu # comment out kernel funtion call [arnoldg@abe1391 ~/sdk2.0]$ nvcc vecadd.cu vecadd.cu(15): warning: variable "N" was declared but never referenced vecadd.cu(15): warning: variable "N" was declared but never referenced
Note the kernel was commented. It doesn't run in the case below, but the output still appears to be correct. Why ?
[arnoldg@abe1391 ~/sdk2.0]$ ./a.out C[0]=0.000000 C[1]=1.000000 C[2]=2.000000 C[3]=3.000000 C[4]=4.000000 C[5]=5.000000 C[6]=6.000000 C[7]=7.000000 C[8]=8.000000 C[9]=9.000000 [arnoldg@abe1391 ~/sdk2.0]$
Running the program on a login node of abe yields:
[arnoldg@honest3 ~/sdk2.0]$ ./vecadd NVIDIA: could not open the device file /dev/nvidiactl (No such device or address). C[0]=-0.000548 C[1]=0.000000 C[2]=0.000000 C[3]=0.000000 C[4]=0.000000 C[5]=0.000000 C[6]=0.000000 C[7]=0.000000 C[8]=0.000000 C[9]=0.000000
Question: The program ran, emitted a warning [to stderr, not stdout] and produced incorrect results. Why ?
Hints: ls -l /dev/nvidia* , /sbin/lsmod | grep nvidia
Answer: A CUDA program may run without a device and yield unexpected results. It's important to check for an Nvidia device within the program. Program defensively.
[arnoldg@honest3 ~/sdk2.0]$ cat -n vecadd_devdetect.cu
1 // Kernel definition
2 __global__ void vecAdd(float* A, float* B, float* C)
3 {
4 int i = threadIdx.x;
5 A[i]=0;
6 B[i]=i;
7 C[i] = A[i] + B[i];
8 }
9
10 #include <stdio.h>
11 #define SIZE 10
12 int main()
13 {
14 int devcheck(int);
15 devcheck(0);
16
17 int N=SIZE;
18 float A[SIZE], B[SIZE], C[SIZE];
19 // Kernel invocation
20
21 float *devPtrA;
22 float *devPtrB;
23 float *devPtrC;
24 int memsize= SIZE * sizeof(float);
25
26 cudaMalloc((void**)&devPtrA, memsize);
27 cudaMalloc((void**)&devPtrB, memsize);
28 cudaMalloc((void**)&devPtrC, memsize);
29 cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
30 cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
31 vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
32 cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);
33
34 for (int i=0; i<SIZE; i++)
35 printf("C[%d]=%f\n",i,C[i]);
36
37 cudaFree(devPtrA);
38 cudaFree(devPtrA);
39 cudaFree(devPtrA);
40 }
41
42 int devcheck(int gpudevice)
43 {
44 int device_count=0;
45 int device; // used with cudaGetDevice() to verify cudaSetDevice()
46
47 // get the number of non-emulation devices detected
48 cudaGetDeviceCount( &device_count);
49 if (gpudevice > device_count)
50 {
51 printf("gpudevice >= device_count ... exiting\n");
52 exit(1);
53 }
54 cudaError_t cudareturn;
55 cudaDeviceProp deviceProp;
56
57 // cudaGetDeviceProperties() is also demonstrated in the deviceQuery/ example
58 // of the sdk projects directory
59 cudaGetDeviceProperties(&deviceProp, gpudevice);
60 printf("[deviceProp.major.deviceProp.minor] = [%d.%d]\n",
61 deviceProp.major, deviceProp.minor);
62
63 if (deviceProp.major > 999)
64 {
65 printf("warning, CUDA Device Emulation (CPU) detected, exiting\n");
66 exit(1);
67 }
68
69 // choose a cuda device for kernel execution
70 cudareturn=cudaSetDevice(gpudevice);
71 if (cudareturn == cudaErrorInvalidDevice)
72 {
73 perror("cudaSetDevice returned cudaErrorInvalidDevice");
74 }
75 else
76 {
77 // double check that device was properly selected
78 cudaGetDevice(&device);
79 printf("cudaGetDevice()=%d\n",device);
80 }
81 }
[arnoldg@honest3 ~/sdk2.0]$
After compiling that version, the program exits when running without a device.
[arnoldg@honest3 ~/sdk2.0]$ ./vecadd_devdetect NVIDIA: could not open the device file /dev/nvidiactl (No such device or address). [deviceProp.major.deviceProp.minor] = [9999.9999] warning, CUDA Device Emulation (CPU) detected, exiting [arnoldg@honest3 ~/sdk2.0]$
$ nvcc –help # There are some rather unique options to the compiler.
[arnoldg@honest3 ~/sdk2.0]$ nvcc --keep --opencc-options -LIST:source=on vecadd.cu
[arnoldg@honest3 ~/sdk2.0]$ cat vecadd.ptx
1 .version 1.2
2 .target sm_10, map_f64_to_f32
3 // compiled with /usr/local/cuda/open64/lib//be
4 // nvopencc built on 2008-06-19
5
6 .reg .u32 %ra<17>;
7 .reg .u64 %rda<17>;
8 .reg .f32 %fa<17>;
9 .reg .f64 %fda<17>;
10 .reg .u32 %rv<5>;
11 .reg .u64 %rdv<5>;
12 .reg .f32 %fv<5>;
13 .reg .f64 %fdv<5>;
14
15
16 //-----------------------------------------------------------
17 // Compiling vecadd.cpp3.i (/tmp/ccBI#.UWx7Mj)
18 //-----------------------------------------------------------
19
20 //-----------------------------------------------------------
21 // Options:
22 //-----------------------------------------------------------
23 // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
24 // -O3 (Optimization level)
25 // -g0 (Debug level)
26 // -m2 (Report advisories)
27 //-----------------------------------------------------------
28
29 .file 1 "vecadd.cudafe2.gpu"
30 .file 2 "/usr/lib/gcc/x86_64-redhat-linux/3.4.6/include/stddef.h"
31 .file 3 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
32 .file 4 "/usr/local/cuda/bin/../include/crt/../host_defines.h"
33 .file 5 "/usr/local/cuda/bin/../include/crt/../builtin_types.h"
34 .file 6 "/usr/local/cuda/bin/../include/crt/../device_types.h"
35 .file 7 "/usr/local/cuda/bin/../include/crt/../driver_types.h"
36 .file 8 "/usr/local/cuda/bin/../include/crt/../texture_types.h"
37 .file 9 "/usr/local/cuda/bin/../include/crt/../vector_types.h"
38 .file 10 "/usr/local/cuda/bin/../include/crt/../device_launch_parameters.h"
39 .file 11 "/usr/local/cuda/bin/../include/crt/storage_class.h"
40 .file 12 "/usr/include/bits/types.h"
41 .file 13 "/usr/include/time.h"
42 .file 14 "vecadd.cu"
43 .file 15 "/usr/local/cuda/bin/../include/common_functions.h"
44 .file 16 "/usr/local/cuda/bin/../include/crt/func_macro.h"
45 .file 17 "/usr/local/cuda/bin/../include/math_functions.h"
46 .file 18 "/usr/local/cuda/bin/../include/device_functions.h"
47 .file 19 "/usr/local/cuda/bin/../include/math_constants.h"
48 .file 20 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
49 .file 21 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
50 .file 22 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
51 .file 23 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
52 .file 24 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
53
54
55 .entry __globfunc__Z6vecAddPfS_S_
56 {
57 .reg .u32 %r<3>;
58 .reg .u64 %rd<10>;
59 .reg .f32 %f<6>;
60 .param .u64 __cudaparm___globfunc__Z6vecAddPfS_S__A;
61 .param .u64 __cudaparm___globfunc__Z6vecAddPfS_S__B;
62 .param .u64 __cudaparm___globfunc__Z6vecAddPfS_S__C;
63 .loc 14 2 0
64 // 1 // Kernel definition
65 // 2 __global__ void vecAdd(float* A, float* B, float* C)
66 $LBB1___globfunc__Z6vecAddPfS_S_:
67 .loc 14 5 0
68 // 3 {
69 // 4 int i = threadIdx.x;
70 // 5 A[i]=0;
71 cvt.s32.u16 %r1, %tid.x; //
72 cvt.u64.s32 %rd1, %r1; //
73 mul.lo.u64 %rd2, %rd1, 4; //
74 ld.param.u64 %rd3, [__cudaparm___globfunc__Z6vecAddPfS_S__A];// id:22 __cudaparm___globfunc__Z6vecAddPfS_S__A+0x0
75 add.u64 %rd4, %rd3, %rd2; //
76 mov.f32 %f1, 0f00000000; // 0
77 st.global.f32 [%rd4+0], %f1; // id:23
78 .loc 14 6 0
79 // 6 B[i]=i;
80 cvt.rn.f32.s32 %f2, %r1; //
81 ld.param.u64 %rd5, [__cudaparm___globfunc__Z6vecAddPfS_S__B];// id:24 __cudaparm___globfunc__Z6vecAddPfS_S__B+0x0
82 add.u64 %rd6, %rd5, %rd2; //
83 st.global.f32 [%rd6+0], %f2; // id:25
84 .loc 14 7 0
85 // 7 C[i] = A[i] + B[i];
86 ld.global.f32 %f3, [%rd4+0]; // id:26
87 add.f32 %f4, %f3, %f2; //
88 ld.param.u64 %rd7, [__cudaparm___globfunc__Z6vecAddPfS_S__C];// id:27 __cudaparm___globfunc__Z6vecAddPfS_S__C+0x0
89 add.u64 %rd8, %rd7, %rd2; //
90 st.global.f32 [%rd8+0], %f4; // id:28
91 .loc 14 8 0
92 // 8 }
93 exit; //
94 $LDWend___globfunc__Z6vecAddPfS_S_:
95 } // __globfunc__Z6vecAddPfS_S_
96
[arnoldg@honest3 ~/sdk2.0]$ ls
…
The collection of intermediate files can be cleared away with the addition of the -clean flag.
See also ptx_isa_1.2.pdf or similar in /usr/local/cuda/doc/ . Ch. 2, Ch. 4
[arnoldg@honest3 ~/sdk2.0]$ nvcc --ptxas-options=-v vecadd.cu ptxas info : Compiling entry function '__globfunc__Z6vecAddPfS_S_' ptxas info : Used 4 registers, 40+32 bytes smem [arnoldg@honest3 ~/sdk2.0]$
scroll down to: Programming Questions 4, 29, 30
Question:
Vecadd revealed via gdb: The sample program can be run in device emulation mode on a system without an Nvidia device and driver loaded for debugging purposes. The host emulation mode for CUDA tries to mimic the device by using pthreads where the device would run hardware threads. What might the device and driver need in order to support a rich gdb-like debugging environment?
Hint: more o/s in the driver for break/interrupts, more silicon on the device and additional instructions in CUDA ptx assembler...
[arnoldg@ac sdk]$ nvcc -g --device-emulation -o vecadd_dbg vecadd.cu
[arnoldg@ac sdk]$ ./vecadd_dbg # compare with running this on host w/ device
NVIDIA: could not open the device file /dev/nvidiactl (No such file or directory).
C[0]=0.000000
C[1]=1.000000
C[2]=2.000000
C[3]=3.000000
C[4]=4.000000
C[5]=5.000000
C[6]=6.000000
C[7]=7.000000
C[8]=8.000000
C[9]=9.000000
[arnoldg@ac sdk]$ gdb ./vecadd_dbg
GNU gdb Fedora (6.8-23.fc9)
Copyright (C) 2008 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu"...
(gdb) l
6 B[i]=i;
7 C[i] = A[i] + B[i];
8 }
9
10
11 #include <stdio.h>
12 #define SIZE 10
13 int main()
14 {
15 int device_count = 0; (gdb) l 16 cudaGetDeviceCount( &device_count );
17
18 int N=SIZE;
19 float A[SIZE], B[SIZE], C[SIZE];
20 // Kernel invocation
21
22 float *devPtrA;
23 float *devPtrB;
24 float *devPtrC;
25 int memsize= SIZE * sizeof(float);
(gdb) l
26
27 cudaMalloc((void**)&devPtrA, memsize);
28 cudaMalloc((void**)&devPtrB, memsize);
29 cudaMalloc((void**)&devPtrC, memsize);
30 cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
31 cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
32 vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
33 cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);
34
35 for (int i=0; i<SIZE; i++)
(gdb) l
36 printf("C[%d]=%f\n",i,C[i]);
37
38 cudaFree(devPtrA);
39 cudaFree(devPtrA);
40 cudaFree(devPtrA);
41 }
42
(gdb) b 32
Breakpoint 1 at 0x40b77c: file vecadd.cu, line 32.
(gdb) run Starting program: /home/ac/arnoldg/sdk/vecadd_dbg
[Thread debugging using libthread_db enabled]
[New Thread 0x7f8dde6f7700 (LWP 6750)]
NVIDIA: could not open the device file /dev/nvidiactl (No such file or directory).
[New Thread 0x400c7950 (LWP 6753)]
[New Thread 0x41e26950 (LWP 6754)]
[New Thread 0x40ef2950 (LWP 6755)]
[New Thread 0x40d78950 (LWP 6756)]
Breakpoint 1, main () at vecadd.cu:32
32 vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
Current language: auto; currently c++
Missing separate debuginfos, use: debuginfo-install gcc.x86_64 glibc.x86_64 zlib.x86_64
(gdb) s tep
dim3 (this=0x7fffe67100a0, x=10, y=1, z=1)
at /usr/local/cuda/bin/../include/vector_types.h:426
426 dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {}
(gdb) s
dim3 (this=0x7fffe67100b0, x=1, y=1, z=1)
at /usr/local/cuda/bin/../include/vector_types.h:426
426 dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {}
(gdb) s
__device_stub__Z6vecAddPfS_S_ (__par0=0x7f8dd8000b00, __par1=0x7f8dd8000c00,
__par2=0x7f8dd8000d00)
at /tmp/tmpxft_00001a1f_00000000-1_vecadd.cudafe1.stub.c:13
13 /tmp/tmpxft_00001a1f_00000000-1_vecadd.cudafe1.stub.c: No such file or directory.
in /tmp/tmpxft_00001a1f_00000000-1_vecadd.cudafe1.stub.c
(gdb) s
cudaLaunch<char> (symbol=0x407ab2 "UH\211�H\201�0\001")
at /usr/local/cuda/bin/../include/cuda_runtime.h:327
327 return cudaLaunch((const char*)symbol);
(gdb) step
[New Thread 0x411d9950 (LWP 6757)]
[New Thread 0x41d27950 (LWP 6758)]
[New Thread 0x414b8950 (LWP 6759)]
[New Thread 0x41f03950 (LWP 6760)]
[New Thread 0x403ad950 (LWP 6761)]
[New Thread 0x40cbc950 (LWP 6762)]
[New Thread 0x41923950 (LWP 6763)]
[New Thread 0x40a5a950 (LWP 6764)]
[New Thread 0x405ac950 (LWP 6765)]
[New Thread 0x41c9a950 (LWP 6766)]
[Thread 0x403ad950 (LWP 6761) exited]
[Thread 0x40cbc950 (LWP 6762) exited]
[Thread 0x405ac950 (LWP 6765) exited]
[Thread 0x40a5a950 (LWP 6764) exited]
[Thread 0x41923950 (LWP 6763) exited]
[Thread 0x414b8950 (LWP 6759) exited]
[Thread 0x41d27950 (LWP 6758) exited]
[Thread 0x41c9a950 (LWP 6766) exited]
[Thread 0x411d9950 (LWP 6757) exited]
[Thread 0x41f03950 (LWP 6760) exited]
328 }
(gdb) s
main () at vecadd.cu:33
33 cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);
(gdb) n
35 for (int i=0; i<SIZE; i++)
(gdb) p C
$2 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
(gdb) c
Continuing.
C[0]=0.000000
C[1]=1.000000
C[2]=2.000000
C[3]=3.000000
C[4]=4.000000
C[5]=5.000000
C[6]=6.000000
C[7]=7.000000
C[8]=8.000000
C[9]=9.000000
[Thread 0x41e26950 (LWP 6754) exited]
[Thread 0x40ef2950 (LWP 6755) exited]
[Thread 0x400c7950 (LWP 6753) exited]
[Thread 0x40d78950 (LWP 6756) exited]
Program exited normally. (gdb)
[arnoldg@ac14 sdk]$
Question: Where pthreads were used to emulate the device, out on the real device those 10 threads would have run in a thread block. How many device core(s) would have been used to run the thread block?
To debug the actual threads, set a breakpoint in the __global__ kernel.
Here's an example of that session:
[arnoldg@ac sdk]$ gdb ./vecadd_dbg GNU gdb Fedora (6.8-23.fc9) Copyright (C) 2008 Free Software Foundation, Inc. License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html> This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law. Type "show copying" and "show warranty" for details. This GDB was configured as "x86_64-redhat-linux-gnu"... (gdb) break 6 Breakpoint 1 at 0x407939: file vecadd.cu, line 6. (gdb) run Starting program: /home/ac/arnoldg/sdk/vecadd_dbg [Thread debugging using libthread_db enabled] [New Thread 0x7f3a0a7e3700 (LWP 16825)] NVIDIA: could not open the device file /dev/nvidiactl (No such file or directory). [New Thread 0x4034f950 (LWP 16828)] [New Thread 0x40bcc950 (LWP 16829)] [New Thread 0x411b7950 (LWP 16830)] [New Thread 0x40d87950 (LWP 16831)] [New Thread 0x41324950 (LWP 16832)] [New Thread 0x40716950 (LWP 16833)] [New Thread 0x4180d950 (LWP 16834)] [New Thread 0x41c7d950 (LWP 16835)] [New Thread 0x40888950 (LWP 16836)] [New Thread 0x403a6950 (LWP 16837)] [New Thread 0x4080b950 (LWP 16838)] [New Thread 0x41280950 (LWP 16839)] [New Thread 0x408c9950 (LWP 16840)] [New Thread 0x41bb9950 (LWP 16841)] [Switching to Thread 0x41324950 (LWP 16832)]
Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; Current language: auto; currently c++ Missing separate debuginfos, use: debuginfo-install gcc.x86_64 glibc.x86_64 zlib.x86_64 (gdb) display i 1: i = 0 (gdb) cont Continuing. [Switching to Thread 0x40716950 (LWP 16833)]
Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; 1: i = 1 (gdb) cont Continuing. [Switching to Thread 0x4180d950 (LWP 16834)]
Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; 1: i = 2 (gdb) thread [Current thread is 8 (Thread 0x4180d950 (LWP 16834))] (gdb) cont Continuing. [Switching to Thread 0x41c7d950 (LWP 16835)]
Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; 1: i = 3 (gdb) thread [Current thread is 9 (Thread 0x41c7d950 (LWP 16835))] (gdb) cont Continuing. [Switching to Thread 0x40888950 (LWP 16836)]
Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; 1: i = 4 (gdb) cont Continuing. [Switching to Thread 0x403a6950 (LWP 16837)] Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; 1: i = 5 (gdb) cont Continuing. [Switching to Thread 0x4080b950 (LWP 16838)]
Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; 1: i = 6 (gdb) thread [Current thread is 12 (Thread 0x4080b950 (LWP 16838))] (gdb) thread apply all continue
Thread 15 (Thread 0x41bb9950 (LWP 16841)): Continuing. [Switching to Thread 0x41280950 (LWP 16839)]
Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; 1: i = 7
Thread 14 (Thread 0x408c9950 (LWP 16840)): Continuing. [Switching to Thread 0x408c9950 (LWP 16840)]
Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; 1: i = 8
Thread 13 (Thread 0x41280950 (LWP 16839)): Continuing. [Switching to Thread 0x41bb9950 (LWP 16841)]
---Type <return> to continue, or q <return> to quit--- Breakpoint 1, vecAdd (A=0x7f3a04000f00, B=0x7f3a04001000, C=0x7f3a04001100) at vecadd.cu:6 6 B[i]=i; 1: i = 9
Thread 12 (Thread 0x4080b950 (LWP 16838)):
Continuing.
[Thread 0x40716950 (LWP 16833) exited] [Thread 0x41324950 (LWP 16832) exited] [Thread 0x408c9950 (LWP 16840) exited] [Thread 0x41c7d950 (LWP 16835) exited] [Thread 0x4080b950 (LWP 16838) exited] [Thread 0x40888950 (LWP 16836) exited] [Thread 0x41280950 (LWP 16839) exited] [Thread 0x403a6950 (LWP 16837) exited] [Thread 0x41bb9950 (LWP 16841) exited] [Thread 0x4180d950 (LWP 16834) exited] C[0]=0.000000 C[1]=1.000000 C[2]=2.000000 C[3]=3.000000 C[4]=4.000000 C[5]=5.000000 C[6]=6.000000 C[7]=7.000000 C[8]=8.000000 C[9]=9.000000 [Thread 0x411b7950 (LWP 16830) exited] [Thread 0x40d87950 (LWP 16831) exited] [Thread 0x4034f950 (LWP 16828) exited] [Thread 0x40bcc950 (LWP 16829) exited]
Program exited normally.
Segmentation fault # varies by gdb session, Nvidia driver version [arnoldg@ac sdk]$
Question: Debugging with device emulation mode is the only way to see what happens to threads in a kernel. Remember there's no printf() out on the device. Why?
BREAK, 30 minutes
Review of 1st hour, questions ?
$ cd projects/reduction
$ ../../bin/linux/release/deviceQuery
…
Device 3: "Quadro FX 5600"
Major revision number: 1
Minor revision number: 0
Total amount of global memory: 1610350592 bytes
Number of multiprocessors: 16
Number of cores: 128
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 8192
Warp size: 32
Maximum number of threads per block: 512
Maximum sizes of each dimension of a block: 512 x 512 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 262144 bytes
Texture alignment: 256 bytes
Clock rate: 1.35 GHz
Concurrent copy and execution: No
...
[arnoldg@abe1391 reduction]$ ../../bin/linux/release/reduction --kernel=5 --n=16384 Reducing array of type int. Using Device 0: "Tesla C1060" 16384 elements 128 threads (max) 64 blocks Average time: 0.025320 ms Bandwidth: 2.588309 GB/s
GPU result = 2089065
CPU result = 2089065
TEST PASSED
[arnoldg@abe1391 reduction]$ ../../bin/linux/release/reduction --kernel=5 --threads=1024 --n=16384
Reducing array of type int.
Using Device 0: "Tesla C1060"
16384 elements
1024 threads (max)
8 blocks
Error: too many threads or blocks, exiting...
[arnoldg@abe1391 reduction]$
[arnoldg@abe1391 reduction]$ vi reduction_kernel.cu # comment warning, exit line 329...
[arnoldg@qp16 reduction]$ diff reduction_kernel.cu reduction_kernel.cu.ok
329,337d328
< struct cudaDeviceProp mycudaDeviceProperties;
< cudaGetDeviceProperties(&mycudaDeviceProperties , 0);
< if( (threads > mycudaDeviceProperties.maxThreadsPerBlock) ||
< (blocks > mycudaDeviceProperties.maxGridSize[0]) )
< {
< printf("Error: too many threads or blocks, exiting...\n");
< exit(1);
< }
<
[arnoldg@abe1391 reduction]$ make clean
[arnoldg@abe1391 reduction]$ make
[arnoldg@abe1391 reduction]$ ../../bin/linux/release/reduction --kernel=5 --threads=1024 --n=16384
Reducing array of type int.
Using Device 0: "Tesla C1060"
16384 elements
1024 threads (max)
8 blocks
Average time: 0.019560 ms
Bandwidth: 3.350509 GB/s
GPU result = 106503 CPU result = 2089065 TEST FAILED
A CUDA program may run incorrectly when device limits are exceeded. There is no capability for segmentation fault or floating point exception traps. It's the programmer's responsibility to ensure that kernels are launched within device constraints. The driver is getting better at detecting some of these conditions, but it's not completely reliable at this time.
gprof notes: vi ../../common/common.mk # add -pg to the Compilers section , lines 65-67 including nvcc vi reduction.cu # change 100 iterations to 1 iteration make clean; make cp ../../bin/linux/release/reduction . ./reduction --kernel=5 --n=4194304 gprof reduction ./reduction --kernel=6 --n=16777216 gprof reduction ./reduction --kernel=3 --n=64 gprof reduction # note that the host cpu is competitive when data fit within cache
Question: When would it make sense to run a kernel with 128 total threads on the device?
Porting compute PI into the CUDA reduction example. This is a classic sample compute pi MPI application:
[arnoldg@ac14 reductionPI]$ cat /home/ac/arnoldg/cuda101/cpi.c
#include "mpi.h"
#include <stdio.h>
#include <math.h>
int main( int argc, char *argv[] )
{
int n, myid, numprocs, i;
double PI25DT = 3.141592653589793238462643;
double mypi, pi, h, sum, x;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
while (1) {
if (myid == 0) {
printf("Enter the number of intervals: (0 quits) ");
scanf("%d",&n);
}
// send 1 MPI_INT “n” from rank 0 to MPI_COMM_WORLD
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (n == 0)
break;
else {
h = 1.0 / (double) n;
sum = 0.0;
// set loop stride to numprocs , each process
// will do 1/numprocs of the work calculating
// partial sums
for (i = myid + 1; i <= n; i += numprocs) {
x = h * ((double)i - 0.5);
sum += (4.0 / (1.0 + x*x));
}
mypi = h * sum;
// collect 1 MPI_DOUBLE mypi from each process and
// accumulate the result in pi
MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0,
MPI_COMM_WORLD);
if (myid == 0)
printf("pi is approximately %.16f, Error is %.16f\n",
pi, fabs(pi - PI25DT));
}
}
MPI_Finalize();
return 0;
}
[arnoldg@ac14 reductionPI]$
Question: Can we port the for loop and MPI_Reduce() from cpi.c to reduction.cu and reduction_kernel.cu ?
Imagine the for loop replaced by a fixed number of threads [16384 works well]. n from cpi.c corresponds to size in reduction.cu and each thread will need that value [h_idata near the end of reduction.cu]. Assigning size to each value of the input array [h_idata] duplicates the work of MPI_Bcast().
The work of the cpi.c for loop would end up in one of the kernel cases of reduction_kernel.cu . A sample program invocation might resemble:
../../bin/linux/release/reductionPI –kernel=1 –type=float -n=16384
This was accomplished with just a few lines of code added to the reduction project. It looks pretty simple, however debugging it was a time consuming process.
[arnoldg@qp reduction]$ diff reduction.cu reduction.cu.ok
456,459d455
< if ((whichKernel==1 ) && (datatype == REDUCE_FLOAT))
< {
< h_idata[i]= (float) size;
< }
522d517
< printf("GPU result*h = %0.12f\n", gpu_result * (1.0/size));
[arnoldg@qp reduction]$
# insert before: sdata[tid] = g_idata[i];
[arnoldg@qp reduction]$ diff reduction_kernel.cu reduction_kernel.cu.ok
105,113d104
< // These lines would have been done in a for loop with numprocs stride for
< // each mpi rank, on the gpu each thread will do a single loop iteration
< if ( g_idata[i] ==16384.0 ) // 1st trip only, all others reduce
< {
< float h= 1.0 / ( g_idata[i]);
< float x = h*( (float) i - 0.5);
< sdata[tid] = (4.0 / ( 1.0 + x*x));
< }
< else
make clean; make
[arnoldg@abe1202 reductionPI]$ ../../bin/linux/release/reductionPI --kernel=1 --type=float -n=16384 Reducing array of type float. Using Device 0: "Tesla C1060" 16384 elements 128 threads (max) 128 blocks Average time: 0.036000 ms Bandwidth: 1.820445 GB/s
GPU result = 3.141714811325
The simplest way forward is to use nvcc for everything. The nvcc compiler wrapper is somewhat more complex than the typical mpicc compiler wrapper, so it's easier to make MPI code into .cu and compile with nvcc than the other way around. A sample makefile might resemble:
[arnoldg@ac14 mpi-gpu]$ cat Makefile MPICC := nvcc -Xptxas -v MPI_INCLUDES := /usr/mpi/intel/mvapich2-1.2p1/include MPI_LIBS := /usr/mpi/intel/mvapich2-1.2p1/lib
%.o : %.cu $(MPICC) -I$(MPI_INCLUDES) -o $@ -c $<
mpi_hello_gpu : vecadd.o mpi_hello_gpu.o $(MPICC) -L$(MPI_LIBS) -lmpich -o $@ *.o
clean : rm vecadd.o mpi_hello_gpu.o
all : mpi_hello_gpu
Source code files follow.
[arnoldg@ac14 mpi-gpu]$ cat mpi_hello_gpu.cu #include <mpi.h> #include <stdio.h> #include <stdlib.h> #define PPN 4 #define INTARRAYLEN 65535 #define BCASTREPS 1000
int main(int argc, char *argv[])
{
int bcastme[INTARRAYLEN], ranksum;
int rank, size, len;
int gpudevice;
int vecadd(int, int);
char name[MPI_MAX_PROCESSOR_NAME];
MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Get_processor_name(name, &len);
// do some MPI work, showing MPI and CUDA being run from one routine
if (rank == 0) { bcastme[3]=3; }
for (int i=0; i<BCASTREPS; i++)
{
MPI_Bcast(bcastme, INTARRAYLEN, MPI_INT, 0, MPI_COMM_WORLD);
}
// modulo is useful in determining unique gpu device ids if ranks // are packed into nodes and not assigned in round robin fashion gpudevice= rank % PPN;
printf("rank %d of %d on %s received bcastme[3]=%d [gpu %d]\n", rank, size, name,bcastme[3], gpudevice);
vecadd(gpudevice, rank);
// more MPI work showing MPI is functional after CUDA
MPI_Reduce(&rank, &ranksum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0) { printf("ranksum= %d\n", ranksum); }
MPI_Finalize(); } [arnoldg@ac14 mpi-gpu]$
Parameters for passing the MPI rank and selecting a gpu were added to vecadd.
[arnoldg@ac14 mpi-gpu]$ cat vecadd.cu
// Kernel definition
__global__ void vecAdd(float* A, float* B, float* C)
{
int i = threadIdx.x;
A[i]=0;
B[i]=i;
C[i] = A[i] + B[i];
}
#include <stdio.h>
#define SIZE 10
#define KERNELINVOKES 5000000
int vecadd(int gpudevice, int rank)
{
int devcheck(int, int);
devcheck(gpudevice, rank);
float A[SIZE], B[SIZE], C[SIZE]; // Kernel invocation
float *devPtrA; float *devPtrB; float *devPtrC; int memsize= SIZE * sizeof(float);
cudaMalloc((void**)&devPtrA, memsize);
cudaMalloc((void**)&devPtrB, memsize);
cudaMalloc((void**)&devPtrC, memsize);
cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);
for (int i=0; i<KERNELINVOKES; i++)
{
vecAdd<<<1, gpudevice>>>(devPtrA, devPtrB, devPtrC);
}
cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);
// calculate only up to gpudevice to show the unique output
// of each rank's kernel launch
for (int i=0; i<gpudevice; i++)
printf("rank %d: C[%d]=%f\n",rank,i,C[i]);
cudaFree(devPtrA); cudaFree(devPtrA); cudaFree(devPtrA); }
int devcheck(int gpudevice, int rank)
{
int device_count=0;
int device; // used with cudaGetDevice() to verify cudaSetDevice()
cudaGetDeviceCount( &device_count);
if (gpudevice >= device_count)
{
printf("gpudevice >= device_count ... exiting\n");
exit(1);
}
cudaError_t cudareturn;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, gpudevice);
if (deviceProp.warpSize <= 1)
{
printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank);
exit(1);
}
cudareturn=cudaSetDevice(gpudevice);
if (cudareturn == cudaErrorInvalidDevice)
{
perror("cudaSetDevice returned cudaErrorInvalidDevice");
}
else
{
cudaGetDevice(&device);
printf("rank %d: cudaGetDevice()=%d\n",rank,device);
}
} [arnoldg@ac14 mpi-gpu]$
Tip:
Some MPI implementations will use locked memory along with CUDA. There's no good convention currently in place to deal with potential resource contention for locked memory between MPI and CUDA. It may make sense to avoid cudaMallocHost() and cudaMemcpy*Async() in cases where MPI also needs locked memory for buffers. Mvapich [for Infiniband clusters] requires some locked memory.
Hardware tip:
For systems with multiple PCI buses [lspci -tv] or multiple NUMA nodes [numactl –hardware] you can use the taskset command and build a memory bandwidth performance table with a few command iterations of bandwidthTest. Test the cpus with taskset -c and the gpus with –device=N.
taskset -c 3 ../../bin/linux/release/bandwidthTest --device=0 --htod
MB/s performance table for ac13
nvidia0 nvidia1 nvidia2 nvidia3
cpu0 1247 1263 1511 1504
cpu1 1576 1588 1509 1499
cpu2 1259 1257 1507 1501
cpu3 1591 1574 1509 1487
That table suggests that cpus 0,2 get along best with the nvidia2,3 devices.
That seems to agree with the value of local_cpulist from the 2nd and 3rd entries listed from "lspci -tv":
cd /sys/devices/ ; find . -name local_cpulist -exec cat {} \; # or local_cpus
...
pci0000:00/0000:00:0f.0/0000:18:00.0/0000:19:01.0/0000:1b:00.0/0000:1c:00.0/0000:1d:00.0/local_cpulist
0,2
pci0000:00/0000:00:0f.0/0000:18:00.0/0000:19:01.0/0000:1b:00.0/0000:1c:01.0/0000:1e:00.0/local_cpulist
0,2
The ordering as shown in lspci matches up to /dev/nvidiaN as a simple 0,1,2,3 numbering.
Newer Nvidia driver and hardware also map gpu numbers to pci locations:
[arnoldg@abe1391 reduction]$ cat /proc/driver/nvidia/cards/1 Model: Tesla C1060 IRQ: 16 Video BIOS: ??.??.??.??.?? Card Type: PCI-E DMA Size: 40 bits DMA Mask: 0xffffffffff Bus Location: 14.00.0 [arnoldg@abe1391 ~]$ /sbin/lspci | grep "3D controller" 12:00.0 3D controller: nVidia Corporation: Unknown device 05e7 (rev a1) 14:00.0 3D controller: nVidia Corporation: Unknown device 05e7 (rev a1)
Summary points and questions.
Cuda programming requires a hands-on approach. Check error and return codes, check for presence of a device, and check results for correctness.
What's the cost of a branch [if (rank/process/thread == 3) { take alt. Path } ]to relative performance of the following parallel programming models: MPI, OpenMP, CUDA
No fortran was harmed in this tutorial.
Until now...
http://www.pgroup.com/lit/pgi_whitepaper_accpre.pdf
http://www.clustermonkey.net//content/view/248/33/1/1/
Questions:
Would an MPI or OpenMP programmer be comfortable porting to the PGI compilers with accelerator directives?
Which programmer would have the easier path to PGI acceleration ?
Which programmer might find CUDA more appealing?