CUDA kernel not launched when creating too many doubles in a __device__ function

I am using NVIDIA Nsight to debug my code, and I noticed that my distance kernel sometimes does not launch. The code attempts to find the distance between two polygons by going through each segment of each polygon and finding the distance between the two segments. The following is my code:
__device__ double point_segment_distance(double px, double py, double x1, double y1, double x2, double y2)
{
    double dx = x2 - x1;
    double dy = y2 - y1;
    //
    //if (dx < 0.01 && dy < 0.01)
    //{
    //    return hypot(px - x1, py - y1);
    //}
    //double t = ((px - x1) * dx + (py - y1) * dy) / (dx * dx + dy * dy);
    //if (t < 0)
    //{
    //    dx = px - x1;
    //    dy = py - y1;
    //}
    //else if (t > 1)
    //{
    //    dx = px - x2;
    //    dy = py - y2;
    //}
    //else
    //{
    //    double near_x = x1 + t * dx;
    //    double near_y = y1 + t * dy;
    //    dx = px - near_x;
    //    dy = py - near_y;
    //}
    //return hypot(dx, dy);
    return 10.0;
}
__device__ bool segments_intersect(double x11, double y11, double x12, double y12, double x21, double y21, double x22, double y22)
{
    double dx1 = x12 - x11;
    double dy1 = y12 - y11;
    double dx2 = x22 - x21;
    double dy2 = y22 - y21;
    double delta = dx2 * dy1 - dy2 * dx1;
    if (delta < 0.01)
    {
        return false;
    }
    double s = (dx1 * (y21 - y11) + dy1 * (x11 - x21)) / delta;
    double t = (dx2 * (y11 - y21) + dy2 * (x21 - x11)) / (-delta);
    return (0 <= s && s <= 1 && 0 <= t && t <= 1);
}
__device__ double segments_distance(double x11, double y11, double x12, double y12, double x21, double y21, double x22, double y22)
{
    if (segments_intersect(x11, y11, x12, y12, x21, y21, x22, y22))
    {
        return 0.0;
    }
    double minimumDist = 999999;
    double tempDist = point_segment_distance(x11, y11, x21, y21, x22, y22);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    tempDist = point_segment_distance(x12, y12, x21, y21, x22, y22);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    tempDist = point_segment_distance(x21, y21, x11, y11, x12, y12);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    tempDist = point_segment_distance(x22, y22, x11, y11, x12, y12);
    if (tempDist < minimumDist)
    {
        minimumDist = tempDist;
    }
    return minimumDist;
}
__global__ void distance(double *x0, double *y0, double *x1, double *y1, double *dist, int *length0, int *length1, int *numDone)
{
    int numComp = threadIdx.x + blockDim.x*blockIdx.x + *numDone;
    int index = threadIdx.x + blockDim.x*blockIdx.x;
    dist[index] = 99999;
    if (numComp < ((*length0)*(*length1)))
    {
        int spot0 = numComp%(*length0);
        int spot1 = numComp/(*length0);
        dist[index] = segments_distance(x0[spot0], y0[spot0], x0[(spot0+1)%(*length0)], y0[(spot0+1)%(*length0)], x1[spot1], y1[spot1], x1[(spot1+1)%(*length1)], y1[(spot1+1)%(*length1)]);
    }
}
void gpuDistance(double *x0, double *y0, double *x1, double *y1)
{
    ...
    distance<<<165, 1024>>>(dev_x0, dev_y0, dev_x1, dev_y1, dev_dist, dev_length0, dev_length1, dev_numDone);
    ...
}
I commented out much of point_segment_distance in order to help me locate the error. As shown, this will not launch the distance kernel. I know this because I am using Nsight CUDA debugging and it doesn't hit my breakpoints.
However, if I also comment out the line "double dy = y2 - y1;" in point_segment_distance, the distance kernel will launch. How is this possible? Why would creating one more double cause the kernel not to launch? Is there a limit to the number of doubles that may be created on the GPU? I have a Tesla C2075. I am aware of the 512 KB per-thread local memory limit; however, looking at my code, I can't imagine that I'm anywhere near that limit. Thanks for any help!

Not hitting breakpoints does not mean that the kernel is not executed, since the compiler has the freedom to perform aggressive optimizations on the code. To check the correctness of kernel launches, you should perform canonical CUDA error checking in the sense of talonmies' post:
What is the canonical way to check for errors using the CUDA runtime API?
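Applied to the kernel in the question, a minimal sketch of such checking (the same gpuErrchk macro is used further down this page; requires <cstdio> and <cstdlib>):
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// after the launch:
distance<<<165, 1024>>>(dev_x0, dev_y0, dev_x1, dev_y1, dev_dist, dev_length0, dev_length1, dev_numDone);
gpuErrchk(cudaPeekAtLastError());   // reports launch/configuration errors
gpuErrchk(cudaDeviceSynchronize()); // reports errors raised during kernel execution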
To have an idea of the optimizations the compiler can perform, consider for example the following code:
__global__ void point_segment_distance(double* distance_squared, const double* __restrict__ x1, const double* __restrict__ y1, const double* __restrict__ x2, const double* __restrict__ y2)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    double dx = x2[i] - x1[i];
    double dy = y2[i] - y1[i];
    //distance_squared[i] = dx*dx+dy*dy;
}
Note the commented-out instruction. When that instruction is commented out, everything inside the kernel function becomes dead code, since it does not contribute to any data in global memory, and it is eliminated by the compiler. Indeed, the disassembled code becomes
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ EXIT ; /* 0x8000000000001de7 */
When the instruction is uncommented, the compiler produces
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R2, SR_CTAID.X; /* 0x2c00000094009c04 */
/*0010*/ S2R R3, SR_TID.X; /* 0x2c0000008400dc04 */
/*0018*/ MOV32I R0, 0x8; /* 0x1800000020001de2 */
/*0020*/ IMAD R18, R2, c[0x0][0x8], R3; /* 0x2006400020249ca3 */
/*0028*/ IMAD R8.CC, R18, R0, c[0x0][0x38]; /* 0x20018000e1221ca3 */
/*0030*/ IMAD.HI.X R9, R18, R0, c[0x0][0x3c]; /* 0x20808000f1225ce3 */
/*0038*/ IMAD R16.CC, R18, R0, c[0x0][0x40]; /* 0x2001800101241ca3 */
/*0040*/ LD.E.64 R10, [R8]; /* 0x8400000000829ca5 */
/*0048*/ IMAD.HI.X R17, R18, R0, c[0x0][0x44]; /* 0x2080800111245ce3 */
/*0050*/ IMAD R12.CC, R18, R0, c[0x0][0x30]; /* 0x20018000c1231ca3 */
/*0058*/ LD.E.64 R4, [R16]; /* 0x8400000001011ca5 */
/*0060*/ IMAD.HI.X R13, R18, R0, c[0x0][0x34]; /* 0x20808000d1235ce3 */
/*0068*/ IMAD R6.CC, R18, R0, c[0x0][0x28]; /* 0x20018000a1219ca3 */
/*0070*/ LD.E.64 R2, [R12]; /* 0x8400000000c09ca5 */
/*0078*/ IMAD.HI.X R7, R18, R0, c[0x0][0x2c]; /* 0x20808000b121dce3 */
/*0080*/ LD.E.64 R14, [R6]; /* 0x8400000000639ca5 */
/*0088*/ DADD R2, R4, -R2; /* 0x4800000008409d01 */
/*0090*/ DMUL R6, R2, R2; /* 0x5000000008219c01 */
/*0098*/ DADD R4, R10, -R14; /* 0x4800000038a11d01 */
/*00a0*/ IMAD R2.CC, R18, R0, c[0x0][0x20]; /* 0x2001800081209ca3 */
/*00a8*/ DFMA R4, R4, R4, R6; /* 0x200c000010411c01 */
/*00b0*/ IMAD.HI.X R3, R18, R0, c[0x0][0x24]; /* 0x208080009120dce3 */
/*00b8*/ ST.E.64 [R2], R4; /* 0x9400000000211ca5 */
/*00c0*/ EXIT ; /* 0x8000000000001de7 */
and the code is not dead anymore.
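A practical corollary (my suggestion, not part of the original answer): if you want breakpoints in device code to remain reachable, make the intermediate values observable, for example by writing them out to global memory, so the compiler cannot eliminate them:
__global__ void point_segment_distance(double *sink, const double* __restrict__ x1, const double* __restrict__ y1, const double* __restrict__ x2, const double* __restrict__ y2)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    double dx = x2[i] - x1[i];
    double dy = y2[i] - y1[i];
    sink[i] = dx + dy;  // any store to global memory keeps dx and dy live
}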

Related

Transposing 8x8 float matrix using NEON intrinsics

I have a program that needs to run a transpose operation on 8x8 float32 matrices many times. I want to transpose these using NEON SIMD intrinsics. I know that the array will always contain 8x8 float elements. I have a baseline non-intrinsic solution below:
void transpose(float *matrix, float *matrixT) {
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++) {
            matrixT[i*8+j] = matrix[j*8+i];
        }
    }
}
I also created an intrinsic solution that transposes each 4x4 quadrant of the 8x8 matrix, and swaps the positions of the second and third quadrants. This solution looks like this:
void transpose_4x4(float *matrix, float *matrixT, int store_index) {
    float32x4_t r0, r1, r2, r3, c0, c1, c2, c3;
    r0 = vld1q_f32(matrix);
    r1 = vld1q_f32(matrix + 8);
    r2 = vld1q_f32(matrix + 16);
    r3 = vld1q_f32(matrix + 24);
    c0 = vzip1q_f32(r0, r1);
    c1 = vzip2q_f32(r0, r1);
    c2 = vzip1q_f32(r2, r3);
    c3 = vzip2q_f32(r2, r3);
    r0 = vcombine_f32(vget_low_f32(c0), vget_low_f32(c2));
    r1 = vcombine_f32(vget_high_f32(c0), vget_high_f32(c2));
    r2 = vcombine_f32(vget_low_f32(c1), vget_low_f32(c3));
    r3 = vcombine_f32(vget_high_f32(c1), vget_high_f32(c3));
    vst1q_f32(matrixT + store_index, r0);
    vst1q_f32(matrixT + store_index + 8, r1);
    vst1q_f32(matrixT + store_index + 16, r2);
    vst1q_f32(matrixT + store_index + 24, r3);
}
void transpose(float *matrix, float *matrixT) {
    // Transpose top-left 4x4 quadrant and store the result in the top-left 4x4 quadrant
    transpose_4x4(matrix, matrixT, 0);
    // Transpose top-right 4x4 quadrant and store the result in the bottom-left 4x4 quadrant
    transpose_4x4(matrix + 4, matrixT, 32);
    // Transpose bottom-left 4x4 quadrant and store the result in the top-right 4x4 quadrant
    transpose_4x4(matrix + 32, matrixT, 4);
    // Transpose bottom-right 4x4 quadrant and store the result in the bottom-right 4x4 quadrant
    transpose_4x4(matrix + 36, matrixT, 36);
}
This solution, however, performs more slowly than the baseline non-intrinsic one. I am struggling to find a faster solution, if one exists, for transposing my 8x8 matrix. Any help would be greatly appreciated!
Edit: both solutions are compiled with the -O1 flag.
First off, you shouldn't expect a huge performance boost to start with:
- there is actually no computation;
- you are dealing with 32-bit data, so there is not much of a bandwidth constraint either.
To sum it up: vectorizing buys just a little saving in bandwidth - that's all.
As for the 4x4 transpose, you don't even need a separate function, but just a macro:
#define TRANSPOSE4x4(pSrc,pDst) vst1q_f32_x4(pDst,vld4q_f32(pSrc))
will do the job since NEON does the 4x4 transpose on the fly when you load the data with vld4.
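A minimal usage sketch of the macro (the buffers are hypothetical; assumes an AArch64 target where vst1q_f32_x4 is available):
void demo(void)
{
    float src[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    float dst[16];
    TRANSPOSE4x4(src, dst);
    // dst now holds 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
}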
But you should ask yourself at this point whether your approach - transposing the whole matrix prior to the actual computation - is the right one if a 4x4 transpose costs virtually nothing. This step could end up being a pure waste of computation and bandwidth. Optimization shouldn't be limited to the final step; it should be considered from the design phase on.
8x8 transpose is a different animal though:
// the 64-bit trn intrinsics operate on float64x2_t, hence these reinterpret helpers
#define TRN1_64(a, b) vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f32(a), vreinterpretq_f64_f32(b)))
#define TRN2_64(a, b) vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f32(a), vreinterpretq_f64_f32(b)))

void transpose8x8(float *pDst, float *pSrc)
{
    float32x4_t row0a, row0b, row1a, row1b, row2a, row2b, row3a, row3b, row4a, row4b, row5a, row5b, row6a, row6b, row7a, row7b;
    float32x4_t r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, r4a, r4b, r5a, r5b, r6a, r6b, r7a, r7b;
    row0a = vld1q_f32(pSrc); pSrc += 4;
    row0b = vld1q_f32(pSrc); pSrc += 4;
    row1a = vld1q_f32(pSrc); pSrc += 4;
    row1b = vld1q_f32(pSrc); pSrc += 4;
    row2a = vld1q_f32(pSrc); pSrc += 4;
    row2b = vld1q_f32(pSrc); pSrc += 4;
    row3a = vld1q_f32(pSrc); pSrc += 4;
    row3b = vld1q_f32(pSrc); pSrc += 4;
    row4a = vld1q_f32(pSrc); pSrc += 4;
    row4b = vld1q_f32(pSrc); pSrc += 4;
    row5a = vld1q_f32(pSrc); pSrc += 4;
    row5b = vld1q_f32(pSrc); pSrc += 4;
    row6a = vld1q_f32(pSrc); pSrc += 4;
    row6b = vld1q_f32(pSrc); pSrc += 4;
    row7a = vld1q_f32(pSrc); pSrc += 4;
    row7b = vld1q_f32(pSrc);
    // pass 1: 32-bit transposes of the 2x2 sub-blocks
    r0a = vtrn1q_f32(row0a, row1a);
    r0b = vtrn1q_f32(row0b, row1b);
    r1a = vtrn2q_f32(row0a, row1a);
    r1b = vtrn2q_f32(row0b, row1b);
    r2a = vtrn1q_f32(row2a, row3a);
    r2b = vtrn1q_f32(row2b, row3b);
    r3a = vtrn2q_f32(row2a, row3a);
    r3b = vtrn2q_f32(row2b, row3b);
    r4a = vtrn1q_f32(row4a, row5a);
    r4b = vtrn1q_f32(row4b, row5b);
    r5a = vtrn2q_f32(row4a, row5a);
    r5b = vtrn2q_f32(row4b, row5b);
    r6a = vtrn1q_f32(row6a, row7a);
    r6b = vtrn1q_f32(row6b, row7b);
    r7a = vtrn2q_f32(row6a, row7a);
    r7b = vtrn2q_f32(row6b, row7b);
    // pass 2: 64-bit transposes of the pass-1 results
    row0a = TRN1_64(r0a, r2a);
    row0b = TRN1_64(r0b, r2b);
    row1a = TRN1_64(r1a, r3a);
    row1b = TRN1_64(r1b, r3b);
    row2a = TRN2_64(r0a, r2a);
    row2b = TRN2_64(r0b, r2b);
    row3a = TRN2_64(r1a, r3a);
    row3b = TRN2_64(r1b, r3b);
    row4a = TRN1_64(r4a, r6a);
    row4b = TRN1_64(r4b, r6b);
    row5a = TRN1_64(r5a, r7a);
    row5b = TRN1_64(r5b, r7b);
    row6a = TRN2_64(r4a, r6a);
    row6b = TRN2_64(r4b, r6b);
    row7a = TRN2_64(r5a, r7a);
    row7b = TRN2_64(r5b, r7b);
    vst1q_f32(pDst, row0a); pDst += 4;
    vst1q_f32(pDst, row4a); pDst += 4;
    vst1q_f32(pDst, row1a); pDst += 4;
    vst1q_f32(pDst, row5a); pDst += 4;
    vst1q_f32(pDst, row2a); pDst += 4;
    vst1q_f32(pDst, row6a); pDst += 4;
    vst1q_f32(pDst, row3a); pDst += 4;
    vst1q_f32(pDst, row7a); pDst += 4;
    vst1q_f32(pDst, row0b); pDst += 4;
    vst1q_f32(pDst, row4b); pDst += 4;
    vst1q_f32(pDst, row1b); pDst += 4;
    vst1q_f32(pDst, row5b); pDst += 4;
    vst1q_f32(pDst, row2b); pDst += 4;
    vst1q_f32(pDst, row6b); pDst += 4;
    vst1q_f32(pDst, row3b); pDst += 4;
    vst1q_f32(pDst, row7b);
}
It boils down to: 16 loads + 32 trns + 16 stores vs. 64 loads + 64 stores.
Now we can clearly see it really isn't worth it. The NEON routine above might be a little faster, but I doubt it will make a difference in the end.
No, you can't optimize it any further. Nobody can. Just make sure the pointers are 64-byte aligned, test it, and decide for yourself.
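For the alignment part, a minimal sketch (aligned_alloc is C11; transpose8x8 is the routine above; remember to fill pSrc before calling it):
#include <stdlib.h>

void run_transpose(void)
{
    /* an 8x8 float matrix is 256 bytes; place both buffers on 64-byte boundaries */
    float *pSrc = aligned_alloc(64, 64 * sizeof(float));
    float *pDst = aligned_alloc(64, 64 * sizeof(float));
    if (pSrc && pDst)
        transpose8x8(pDst, pSrc);
    free(pDst);
    free(pSrc);
}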
    ld1  {v0.4s-v3.4s}, [x1], #64
    ld1  {v4.4s-v7.4s}, [x1], #64
    ld1  {v16.4s-v19.4s}, [x1], #64
    ld1  {v20.4s-v23.4s}, [x1]
    trn1 v24.4s, v0.4s, v2.4s   // row0
    trn1 v25.4s, v1.4s, v3.4s
    trn2 v26.4s, v0.4s, v2.4s   // row1
    trn2 v27.4s, v1.4s, v3.4s
    trn1 v28.4s, v4.4s, v6.4s   // row2
    trn1 v29.4s, v5.4s, v7.4s
    trn2 v30.4s, v4.4s, v6.4s   // row3
    trn2 v31.4s, v5.4s, v7.4s
    trn1 v0.4s, v16.4s, v18.4s  // row4
    trn1 v1.4s, v17.4s, v19.4s
    trn2 v2.4s, v16.4s, v18.4s  // row5
    trn2 v3.4s, v17.4s, v19.4s
    trn1 v4.4s, v20.4s, v22.4s  // row6
    trn1 v5.4s, v21.4s, v23.4s
    trn2 v6.4s, v20.4s, v22.4s  // row7
    trn2 v7.4s, v21.4s, v23.4s
    trn1 v16.2d, v24.2d, v28.2d // row0a
    trn1 v17.2d, v0.2d, v4.2d   // row0b
    trn1 v18.2d, v26.2d, v30.2d // row1a
    trn1 v19.2d, v2.2d, v6.2d   // row1b
    trn2 v20.2d, v24.2d, v28.2d // row2a
    trn2 v21.2d, v0.2d, v4.2d   // row2b
    trn2 v22.2d, v26.2d, v30.2d // row3a
    trn2 v23.2d, v2.2d, v6.2d   // row3b
    st1  {v16.4s-v19.4s}, [x0], #64
    st1  {v20.4s-v23.4s}, [x0], #64
    trn1 v16.2d, v25.2d, v29.2d // row4a
    trn1 v17.2d, v1.2d, v5.2d   // row4b
    trn1 v18.2d, v27.2d, v31.2d // row5a
    trn1 v19.2d, v3.2d, v7.2d   // row5b
    trn2 v20.2d, v25.2d, v29.2d // row6a
    trn2 v21.2d, v1.2d, v5.2d   // row6b
    trn2 v22.2d, v27.2d, v31.2d // row7a
    trn2 v23.2d, v3.2d, v7.2d   // row7b
    st1  {v16.4s-v19.4s}, [x0], #64
    st1  {v20.4s-v23.4s}, [x0]
    ret
Above is the hand-optimized assembly version - most probably as short as it can get, but not meaningfully faster. Below is the pure C version that I'd settle for:
void transpose8x8(float *pDst, float *pSrc)
{
    uint32_t i = 8;
    do {
        pDst[0] = *pSrc++;
        pDst[8] = *pSrc++;
        pDst[16] = *pSrc++;
        pDst[24] = *pSrc++;
        pDst[32] = *pSrc++;
        pDst[40] = *pSrc++;
        pDst[48] = *pSrc++;
        pDst[56] = *pSrc++;
        pDst++;
    } while (--i);
}
or
void transpose8x8(float *pDst, float *pSrc)
{
    uint32_t i = 8;
    do {
        *pDst++ = pSrc[0];
        *pDst++ = pSrc[8];
        *pDst++ = pSrc[16];
        *pDst++ = pSrc[24];
        *pDst++ = pSrc[32];
        *pDst++ = pSrc[40];
        *pDst++ = pSrc[48];
        *pDst++ = pSrc[56];
        pSrc++;
    } while (--i);
}
PS: It could bring some gain in performance/power consumption if you declared pDst and pSrc as uint32_t *, because the compiler would then generate pure integer machine code, which has the most varied addressing modes, and would use only w registers instead of s ones. Just typecast float * to uint32_t *.
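A sketch of that suggestion (my reading of it; note the cast technically violates strict aliasing, so compile accordingly or use memcpy-based type punning if that worries you):
#include <stdint.h>

void transpose8x8_u32(uint32_t *pDst, uint32_t *pSrc)
{
    uint32_t i = 8;
    do {
        *pDst++ = pSrc[0];
        *pDst++ = pSrc[8];
        *pDst++ = pSrc[16];
        *pDst++ = pSrc[24];
        *pDst++ = pSrc[32];
        *pDst++ = pSrc[40];
        *pDst++ = pSrc[48];
        *pDst++ = pSrc[56];
        pSrc++;
    } while (--i);
}
/* call as: transpose8x8_u32((uint32_t *)pDst, (uint32_t *)pSrc); */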
PS2: Clang already utilizes w registers instead of s ones, while GCC is being GCC... When will GNU shills finally admit that GCC is an extremely bad choice for ARM?
godbolt
PS3: Below is the non-NEON version in assembly (zero latency), since I was very disappointed (even shocked) by both Clang and GCC above:
    .arch armv8-a
    .global transpose8x8
    .text
    .balign 64
    .func
transpose8x8:
    mov  w10, #8
    sub  x0, x0, #8
    .balign 16
1:
    ldr  w2, [x1, #0]
    ldr  w3, [x1, #32]
    ldr  w4, [x1, #64]
    ldr  w5, [x1, #96]
    ldr  w6, [x1, #128]
    ldr  w7, [x1, #160]
    ldr  w8, [x1, #192]
    ldr  w9, [x1, #224]
    subs w10, w10, #1
    stp  w2, w3, [x0, #8]
    add  x1, x1, #4
    stp  w4, w5, [x0, #16]
    stp  w6, w7, [x0, #24]
    stp  w8, w9, [x0, #32]!
    b.ne 1b
    .balign 16
    ret
    .endfunc
    .end
It's arguably the best version you will ever get if you still insist on doing a pure 8x8 transpose. It might be a little slower than the NEON assembly version, but it consumes considerably less power.
It's possible to optimise the 8x8 NEON code presented in the other answer: an 8x8 transpose can be thought of not only as a recursive version of [A B; C D]' == [A' C'; B' D'], but also as a repeated application of zip or unzip.
a b c d
e f g h
i j k l
m n o p  ==  a b c d e f g h i j k l m n o p
zip(first half, last half) == a i b j c k d l e m f n g o h p
zip(first half, last half) == a e i m b f j n c g k o d h l p == transpose
For an 8x8 matrix we need to apply this algorithm three times; by reading the data with vld4, two of those passes are already done.
float32x4x4_t d0 = vld4q_f32(input);
float32x4x4_t d1 = vld4q_f32(input + 16);
float32x4x4_t d2 = vld4q_f32(input + 32);
float32x4x4_t d3 = vld4q_f32(input + 48);
float32x4x4_t e0 = {
    vzipq_f32(d0.val[0], d2.val[0]).val[0],
    vzipq_f32(d0.val[1], d2.val[1]).val[0],
    vzipq_f32(d0.val[2], d2.val[2]).val[0],
    vzipq_f32(d0.val[3], d2.val[3]).val[0]
};
float32x4x4_t e1 = {
    vzipq_f32(d1.val[0], d3.val[0]).val[0],
    vzipq_f32(d1.val[1], d3.val[1]).val[0],
    vzipq_f32(d1.val[2], d3.val[2]).val[0],
    vzipq_f32(d1.val[3], d3.val[3]).val[0]
};
float32x4x4_t e2 = {
    vzipq_f32(d0.val[0], d2.val[0]).val[1],
    vzipq_f32(d0.val[1], d2.val[1]).val[1],
    vzipq_f32(d0.val[2], d2.val[2]).val[1],
    vzipq_f32(d0.val[3], d2.val[3]).val[1]
};
float32x4x4_t e3 = {
    vzipq_f32(d1.val[0], d3.val[0]).val[1],
    vzipq_f32(d1.val[1], d3.val[1]).val[1],
    vzipq_f32(d1.val[2], d3.val[2]).val[1],
    vzipq_f32(d1.val[3], d3.val[3]).val[1]
};
vst1q_f32_x4(output, e0);
vst1q_f32_x4(output + 16, e1);
vst1q_f32_x4(output + 32, e2);
vst1q_f32_x4(output + 48, e3);
One should also be able to perform the transpose by starting with vld1q_f32_x4, then doing the uzp passes, and finishing with vst4q_f32.
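Whichever variant you settle on, a tiny correctness harness is cheap insurance (a sketch; it assumes one of the transpose8x8 implementations above is linked in):
#include <stdio.h>

void transpose8x8(float *pDst, float *pSrc);  /* any of the versions above */

int main(void)
{
    float in[64], out[64];
    for (int i = 0; i < 64; i++)
        in[i] = (float)i;
    transpose8x8(out, in);
    for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
            if (out[r * 8 + c] != in[c * 8 + r])
                printf("mismatch at (%d,%d)\n", r, c);
    return 0;
}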

Rewriting a C matrix multiplication in ARM(64) assembly

I'm trying to translate a matrix multiplication written in C into ARM assembly by hand. The matrices are always square, with dimension TAM, and I'm using one-dimensional arrays, as you can see here. The original C code is:
void multiply(int *a, int *b, int *c){
    int i = 0;
    while(i < TAM){
        int j = 0;
        while(j < TAM){
            int k = 0;
            while(k < TAM){
                c[i*TAM + j] = c[i*TAM + j] + a[i*TAM + k]*b[k*TAM + j];
                k++;
            }
            j++;
        }
        i++;
    }
}
And the ARM code segfaults on the store instruction, but I cannot find the cause:
    ldr x0, =matriz1
    ldr x1, =matriz2
    ldr x2, =matriz3
    mov x3, #10              //TAM = 10
    mov x4, #0               //i = 0
loop_i:                      //while(i<TAM){
    mov x5, #0               //j = 0
loop_j:                      //while(j<TAM){
    mov x6, #0               //k = 0
loop_k:                      //while(k<TAM){
    madd x7, x3, x4, x6      //x7 <- i*TAM + k
    ldr x7, [x0, x7]         //a[x7]
    madd x8, x6, x3, x5      //x8 <- k*TAM + j
    ldr x8, [x1, x8]         //b[x8]
    madd x9, x3, x4, x5      //x9 <- i*TAM + j
    ldr x10, [x2, x9]        //c[x9]
    madd x10, x7, x8, x10    //c[i*TAM + j] + a[i*TAM + k]*b[k*TAM + j]
    str x10, [x2, x9]        //c[i*TAM + j] = c[i*TAM + j] + a[i*TAM + k]*b[k*TAM + j]
end_loop_k:
    add x6, x6, #1           //k++
    cmp x6, #10
    blt loop_k               // }
end_loop_j:
    add x5, x5, #1           //j++
    cmp x5, #10
    blt loop_j               // }
end_loop_i:
    add x4, x4, #1           //i++
    cmp x4, #10
    blt loop_i               //}
The comments refer to the original C code.
Do you have any idea why it segfaults?
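For what it's worth, one likely cause (an assumption from reading the listing, not a verified fix): the arrays hold 4-byte ints, yet the code loads and stores 8-byte x registers and uses the raw element index as a byte offset, so the accesses run well past the ends of the arrays. A sketch of the inner loop with 32-bit accesses and the index scaled by 4:
loop_k:
    madd x7, x3, x4, x6          //x7 <- i*TAM + k (element index)
    ldr  w7, [x0, x7, lsl #2]    //a[i*TAM + k], 4-byte load, byte offset = index*4
    madd x8, x6, x3, x5          //x8 <- k*TAM + j
    ldr  w8, [x1, x8, lsl #2]    //b[k*TAM + j]
    madd x9, x3, x4, x5          //x9 <- i*TAM + j
    ldr  w10, [x2, x9, lsl #2]   //c[i*TAM + j]
    madd w10, w7, w8, w10        //c[i*TAM + j] + a[i*TAM + k]*b[k*TAM + j]
    str  w10, [x2, x9, lsl #2]   //store back to c[i*TAM + j]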

Can I ensure that NVCC has managed to place an array in registers?

A CUDA kernel with some local, fixed-size array may get compiled so that the array resides in the thread's "local memory", or - if NVCC can determine the position of each array access at compile time, and there are enough registers available - the array might be broken up with its elements residing in registers.
Is it possible to check or to ensure, either via the code or as part of the build process, that a specific array, or all local arrays in a kernel, have been fit into registers? Is doing so supported by any tool?
At runtime
You may use CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES as a hint as to whether your array has been registrified, via the CUDA driver API function cuFuncGetAttribute. For some use cases, though, runtime may be too late.
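A minimal sketch of that query (driver API; obtaining the CUmodule/CUfunction handles is assumed to have happened already):
#include <cuda.h>
#include <cstdio>

void report_local_bytes(CUfunction f)
{
    int localBytes = 0;
    // local memory used per thread by this function
    cuFuncGetAttribute(&localBytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, f);
    printf("local memory per thread: %d bytes\n", localBytes);
    // 0 bytes is a strong hint that the array was registrified
    // (runtime API equivalent: cudaFuncGetAttributes -> localSizeBytes)
}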
At compile time
You want to have a look at the generated PTX file (using the --keep option of nvcc).
Local data declarations are identified by .local in the PTX. Here is a small example, with a kernel:
#define ww 65
__global__ void kernel(int W, int H, const int *a, int *b)
{
    int buffer[ww];
    for (int i = threadIdx.x; i < H; i += blockDim.x)
    {
        #pragma unroll
        for (int w = 0; w < ww; ++w)
            buffer[w] = a[i + w * W];
        for (int j = 5; j < H - 5; ++j)
        {
            buffer[j % ww] = a[i + (j + 6) * W];
            int s = 0;
            #pragma unroll
            for (int w = 0; w < ww; ++w)
                s += buffer[w];
            b[i + (j + 6) * W] = s;
        }
    }
}
When compiled there is a declaration of a local variable:
.visible .entry _Z6kerneliiPKiPi(
    .param .u32 _Z6kerneliiPKiPi_param_0,
    .param .u32 _Z6kerneliiPKiPi_param_1,
    .param .u64 _Z6kerneliiPKiPi_param_2,
    .param .u64 _Z6kerneliiPKiPi_param_3
)
{
    .local .align 4 .b8 __local_depot0[260];
    .reg .b64 %SP;
    .reg .b64 %SPL;
    .reg .pred %p<5>;
    .reg .b32 %r<219>;
    .reg .b64 %rd<81>;
However, when the buffer is rolled by shifting its elements, it is always accessed with indices known at compile time, and registers may be obtained - no local storage:
#define ww 65
__global__ void kernel(int W, int H, const int *a, int *b)
{
    int buffer[ww];
    for (int i = threadIdx.x; i < H; i += blockDim.x)
    {
        #pragma unroll
        for (int w = 0; w < ww; ++w)
            buffer[w] = a[i + w * W];
        for (int j = 5; j < H - 5; ++j)
        {
            #pragma unroll
            for (int w = 0; w < ww-1; ++w)
                buffer[w] = buffer[w + 1];
            buffer[ww - 1] = a[i + (j + 6) * W];
            int s = 0;
            #pragma unroll
            for (int w = 0; w < ww; ++w)
                s += buffer[w];
            b[i + (j + 6) * W] = s;
        }
    }
}
This yields the following PTX:
.visible .entry _Z6kerneliiPKiPi(
    .param .u32 _Z6kerneliiPKiPi_param_0,
    .param .u32 _Z6kerneliiPKiPi_param_1,
    .param .u64 _Z6kerneliiPKiPi_param_2,
    .param .u64 _Z6kerneliiPKiPi_param_3
)
{
    .reg .pred %p<5>;
    .reg .b32 %r<393>;
    .reg .b64 %rd<240>;
Note that, depending on the number of registers available, the number of required registers may not fit. These are virtual registers (something that has changed somewhat in recent versions of CUDA), meaning that the absence of .local .align 4 .b8 __local_depot is a prerequisite, but not sufficient.
You then need to look at the SASS. Using nvdisasm on your generated .cubin, search for the STL instruction, which stands for STore Local, as described briefly here. Below are parts of two disassembled cubins compiled with two different values of the --maxrregcount compiler switch - first for 32 (note the many occurrences of STL):
//--------------------- .text._Z6kerneliiPKiPi --------------------------
.section .text._Z6kerneliiPKiPi,"ax",#progbits
.sectioninfo #"SHI_REGISTERS=32"
.align 32
.global _Z6kerneliiPKiPi
.type _Z6kerneliiPKiPi,#function
.size _Z6kerneliiPKiPi,(.L_25 - _Z6kerneliiPKiPi)
.other _Z6kerneliiPKiPi,#"STO_CUDA_ENTRY STV_DEFAULT"
_Z6kerneliiPKiPi:
.text._Z6kerneliiPKiPi:
/*0008*/ MOV R1, c[0x0][0x20];
/*0010*/ { IADD32I R1, R1, -0x180;
/*0018*/ S2R R0, SR_TID.X; }
/*0028*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x144], PT;
/*0030*/ NOP;
/*0038*/ NOP;
/*0048*/ #P0 EXIT;
.L_3:
/*0050*/ IADD R2, R0, c[0x0][0x140];
/*0058*/ MOV R30, c[0x0][0x140];
/*0068*/ ISCADD R5.CC, R2.reuse, c[0x0][0x148], 0x2;
/*0070*/ { SHR R3, R2, 0x1e;
/*0078*/ STL [R1+0x14], R5; }
/*0088*/ ISCADD R2, R30.reuse, R0.reuse, 0x1;
/*0090*/ ISCADD R4, R30.reuse, R0.reuse, 0x2;
/*0098*/ ISCADD R20, R30, R0, 0x3;
/*00a8*/ IADD.X R5, R3, c[0x0][0x14c];
/*00b0*/ { SHR R3, R2.reuse, 0x1e;
/*00b8*/ STL [R1+0x10], R5; }
/*00c8*/ ISCADD R2.CC, R2, c[0x0][0x148], 0x2;
/*00d0*/ STL [R1+0x8], R2;
/*00d8*/ SHR R5, R4, 0x1e;
/*00e8*/ IADD.X R2, R3, c[0x0][0x14c];
/*00f0*/ { ISCADD R4.CC, R4, c[0x0][0x148], 0x2;
/*00f8*/ STL [R1+0x4], R2; }
Then for 255 - no occurrence of STL:
//--------------------- .text._Z6kerneliiPKiPi --------------------------
.section .text._Z6kerneliiPKiPi,"ax",#progbits
.sectioninfo #"SHI_REGISTERS=124"
.align 32
.global _Z6kerneliiPKiPi
.type _Z6kerneliiPKiPi,#function
.size _Z6kerneliiPKiPi,(.L_25 - _Z6kerneliiPKiPi)
.other _Z6kerneliiPKiPi,#"STO_CUDA_ENTRY STV_DEFAULT"
_Z6kerneliiPKiPi:
.text._Z6kerneliiPKiPi:
/*0008*/ MOV R1, c[0x0][0x20];
/*0010*/ S2R R0, SR_TID.X;
/*0018*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x144], PT;
/*0028*/ NOP;
/*0030*/ NOP;
/*0038*/ #P0 EXIT;
/*0048*/ MOV R46, c[0x0][0x144];
/*0050*/ IADD R47, RZ, -c[0x0][0x140];
/*0058*/ IADD32I R46, R46, -0x5;
/*0068*/ SHL R47, R47, 0x2;
.L_3:
/*0070*/ ISETP.LT.AND P0, PT, R46, 0x6, PT;
/*0078*/ #P0 BRA `(.L_1);
/*0088*/ MOV R2, c[0x0][0x140];
/*0090*/ ISCADD R2, R2, R0, 0x6;
/*0098*/ SHR R27, R2.reuse, 0x1e;
/*00a8*/ ISCADD R26.CC, R2, c[0x0][0x148], 0x2;
/*00b0*/ SHR R48, R47, 0x1f;
/*00b8*/ IADD.X R27, R27, c[0x0][0x14c];
/*00c8*/ { IADD R44.CC, R47.reuse, R26;
/*00d0*/ LDG.E R49, [R26]; }
/*00d8*/ IADD.X R45, R48.reuse, R27;
/*00e8*/ { IADD R42.CC, R47.reuse, R44 SLOT 0;
/*00f0*/ LDG.E R44, [R44] SLOT 1; }
/*00f8*/ IADD.X R43, R48.reuse, R45;
/*0108*/ { IADD R38.CC, R47, R42 SLOT 0;
/*0110*/ LDG.E R42, [R42] SLOT 1; }
Very much like you, I assume, I wish that all of this were better documented.

Float4 not faster than float in cuda

Edit: njuffa is right - this version was compiled with -G, which disables all optimizations. The new SASS is way faster, as loads and stores are vectorized.
Based on classic examples, I have modified two versions of vector addition in CUDA. The thing is, the float4 version takes twice as long as the float version, even though it is launched with a quarter as many blocks. Profiling both kernels clearly shows the float4 version performing on average 4 loads and 4 stores per transaction, while the float version does only one of each. It sounds like a noob question about misaligned float4 access - which, BTW, the PTX below seems to confirm - but I can't find where the problem is.
I am using CUDA 7.0 RC with a Quadro K4000.
Any ideas on where to look?
Compile options?
The __aligned__ keyword?
__global__ void add_float(float *c, const float *a, const float *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    c[i] = a[i] + b[i];
}
__global__ void add_float4(float4 *c, const float4 *a, const float4 *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float4 a1 = a[i];
    float4 b1 = b[i];
    float4 c1;
    c1.x = a1.x + b1.x;
    c1.y = a1.y + b1.y;
    c1.z = a1.z + b1.z;
    c1.w = a1.w + b1.w;
    c[i] = c1;
}
The PTX for the line
float4 a1 = a[i];
says:
...
    ld.f32 %f1, [%rd6];
    ld.f32 %f2, [%rd6+4];
    ld.f32 %f3, [%rd6+8];
    ld.f32 %f4, [%rd6+12];
    st.f32 [%SP+12], %f4;
    st.f32 [%SP+8], %f3;
    st.f32 [%SP+4], %f2;
    st.f32 [%SP+0], %f1;
...
The SASS objdump says:
/*0108*/ MOV R10, R0; /* 0x2800000000029de4 */
/*0110*/ ISET.LT.AND R11, R0, RZ, PT; /* 0x108e0000fc02dc23 */
/*0118*/ MOV32I R13, 0x4; /* 0x1800000010035de2 */
/*0120*/ ISETP.LE.U32.AND P0, PT, R13, 0x20, PT; /* 0x198ec00080d1dc03 */
/*0128*/ ISUB R12, 0x20, R13; /* 0x4800c00080d31e03 */
/*0130*/ SHL R11, R11, R13; /* 0x6000000034b2dc03 */
/*0138*/ SHR.U32 R14, R10, R12; /* 0x5800000030a39c03 */
/* 0x22c2804282328047 */
/*0148*/ IADD R11, R11, R14; /* 0x4800000038b2dc03 */
/*0150*/ #!P0 IADD R12, R13, -0x20; /* 0x4800ffff80d32003 */
/*0158*/ #!P0 SHL R11, R10, R12; /* 0x6000000030a2e003 */
/*0160*/ SHL R10, R10, R13; /* 0x6000000034a29c03 */
/*0168*/ MOV R10, R10; /* 0x2800000028029de4 */
/*0170*/ MOV R11, R11; /* 0x280000002c02dde4 */
/*0178*/ IADD R8.CC, R8, R10; /* 0x4801000028821c03 */
/* 0x228042c042828047 */
/*0188*/ IADD.X R9, R9, R11; /* 0x480000002c925c43 */
/*0190*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0198*/ MOV R9, R9; /* 0x2800000024025de4 */
/*01a0*/ LD.E R10, [R8]; /* 0x8400000000829c85 */
/*01a8*/ IADD R12.CC, R8, 0x4; /* 0x4801c00010831c03 */
/*01b0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01b8*/ MOV R12, R12; /* 0x2800000030031de4 */
/* 0x2202828042c2e287 */
/*01c8*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01d0*/ LD.E R11, [R12]; /* 0x8400000000c2dc85 */
/*01d8*/ IADD R12.CC, R8, 0x8; /* 0x4801c00020831c03 */
/*01e0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01e8*/ MOV R12, R12; /* 0x2800000030031de4 */
/*01f0*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01f8*/ LD.E R12, [R12]; /* 0x8400000000c31c85 */
/* 0x2282c202828042c7 */
/*0208*/ IADD R8.CC, R8, 0xc; /* 0x4801c00030821c03 */
/*0210*/ IADD.X R9, R9, RZ; /* 0x48000000fc925c43 */
/*0218*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0220*/ MOV R9, R9; /* 0x2800000024025de4 */
/*0228*/ LD.E R8, [R8]; /* 0x8400000000821c85 */
/*0230*/ IADD R14.CC, R2, 0xc; /* 0x4801c00030239c03 */
/*0238*/ IADD.X R15, R3, RZ; /* 0x48000000fc33dc43 */
/* 0x22828042c2e28047 */
/*0248*/ MOV R14, R14; /* 0x2800000038039de4 */
/*0250*/ MOV R15, R15; /* 0x280000003c03dde4 */
/*0258*/ ST.E [R14], R8; /* 0x9400000000e21c85 */
/*0260*/ IADD R8.CC, R2, 0x8; /* 0x4801c00020221c03 */
/*0268*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*0270*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0278*/ MOV R9, R9; /* 0x2800000024025de4 */
/* 0x22c2e2828042c2e7 */
/*0288*/ ST.E [R8], R12; /* 0x9400000000831c85 */
/*0290*/ IADD R8.CC, R2, 0x4; /* 0x4801c00010221c03 */
/*0298*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02a0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02a8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02b0*/ ST.E [R8], R11; /* 0x940000000082dc85 */
/*02b8*/ IADD R8.CC, R2, RZ; /* 0x48010000fc221c03 */
/* 0x22820042e2828047 */
/*02c8*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02d0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02d8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02e0*/ ST.E [R8], R10; /* 0x9400000000829c85 */
Here is the rest:
void CudaTest()
{
    int size = 8192;
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    float *host_a = (float*)malloc(4 * size * sizeof(float));
    float *host_b = (float*)malloc(4 * size * sizeof(float));
    float *host_c = (float*)malloc(4 * size * sizeof(float));
    float4 *dev_a4 = 0;
    float4 *dev_b4 = 0;
    float4 *dev_c4 = 0;
    float4 *host_a4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_b4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_c4 = (float4*)malloc(size * sizeof(float4));
    for (int i = 0; i < 4 * size; i++)
    {
        host_a[i] = rand() / RAND_MAX;
        host_b[i] = rand() / RAND_MAX;
    }
    for (int i = 0; i < size; i++)
    {
        host_a4[i].x = rand() / RAND_MAX;
        host_a4[i].y = rand() / RAND_MAX;
        host_a4[i].z = rand() / RAND_MAX;
        host_a4[i].w = rand() / RAND_MAX;
        host_b4[i].x = rand() / RAND_MAX;
        host_b4[i].y = rand() / RAND_MAX;
        host_b4[i].z = rand() / RAND_MAX;
        host_b4[i].w = rand() / RAND_MAX;
    }
    // Choose which GPU to run on, change this on a multi-GPU system.
    CUDA_CALL(cudaSetDevice(0));
    // Allocate GPU buffers for three vectors (two input, one output).
    CUDA_CALL(cudaMalloc((void**)&dev_c, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_a, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_b, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_c4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_a4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_b4, size * sizeof(float4)));
    // Copy input vectors from host memory to GPU buffers.
    CUDA_CALL(cudaMemcpy(dev_a, host_a, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b, host_b, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_a4, host_a4, size * sizeof(float4), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b4, host_b4, size * sizeof(float4), cudaMemcpyHostToDevice));
    int local = 256;
    int N = size / local;
    // Launch a kernel on the GPU with one thread for each element.
    add_float<<<4*N, local>>>(dev_c, dev_a, dev_b);
    // Check for any errors launching the kernel
    CUDA_CALL(cudaGetLastError());
    add_float4<<<N, local>>>(dev_c4, dev_a4, dev_b4);
    // Check for any errors launching the kernel
    CUDA_CALL(cudaGetLastError());
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    CUDA_CALL(cudaDeviceSynchronize());
    // Copy output vector from GPU buffer to host memory.
    CUDA_CALL(cudaMemcpy(host_c, dev_c, 4 * size * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(host_c4, dev_c4, size * sizeof(float4), cudaMemcpyDeviceToHost));
}
Use of the vector load/store instructions provided by the GPU hardware is considered a performance optimization applied by the compiler, since the code is fully functional using scalar loads and stores. When code is compiled by nvcc with -G (usually for debugging), all optimizations, including the vectorization of loads and stores, are turned off.
To check for load/store vectorization, it is important to look at the actual machine code (SASS) being executed, rather than at PTX, which is merely an intermediate representation that gets compiled into SASS by an optimizing compiler component called ptxas, invoked under the hood by the compiler driver nvcc. Run cuobjdump --dump-sass on the executable produced by nvcc to inspect the machine code.

Efficiency of CUDA vector types (float2, float3, float4)

I'm trying to understand the integrate_functor in particles_kernel.cu from the CUDA examples:
struct integrate_functor
{
    float deltaTime;
    //constructor for functor
    //...
    template <typename Tuple>
    __device__
    void operator()(Tuple t)
    {
        volatile float4 posData = thrust::get<2>(t);
        volatile float4 velData = thrust::get<3>(t);
        float3 pos = make_float3(posData.x, posData.y, posData.z);
        float3 vel = make_float3(velData.x, velData.y, velData.z);
        // update position and velocity
        // ...
        // store new position and velocity
        thrust::get<0>(t) = make_float4(pos, posData.w);
        thrust::get<1>(t) = make_float4(vel, velData.w);
    }
};
We call make_float4(pos, age) but make_float4 is defined in vector_functions.h as
static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w)
{
    float4 t;
    t.x = x; t.y = y; t.z = z; t.w = w;
    return t;
}
Are the CUDA vector types (float3 and float4) more efficient for the GPU, and how does the compiler know how to overload the function make_float4?
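On the overload half of the question: vector_functions.h only provides the four-scalar form quoted above; the particles example pulls additional overloads from the samples' math helper header (helper_math.h in newer sample trees, cutil_math.h in older ones), along the lines of the sketch below, and ordinary C++ overload resolution then picks the right one by argument types:
inline __host__ __device__ float4 make_float4(float3 a, float w)
{
    return make_float4(a.x, a.y, a.z, w);
}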
I'm expanding njuffa's comment into a worked example. In that example, I'm simply adding two arrays in three different ways: loading the data as float, float2, or float4.
These are the timings on a GT540M and on a Kepler K20c card:
GT540M
float - Elapsed time: 74.1 ms
float2 - Elapsed time: 61.0 ms
float4 - Elapsed time: 56.1 ms
Kepler K20c
float - Elapsed time: 4.4 ms
float2 - Elapsed time: 3.3 ms
float4 - Elapsed time: 3.2 ms
As can be seen, loading the data as float4 is the fastest approach.
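To put those numbers in perspective, a back-of-the-envelope effective-bandwidth estimate (my own arithmetic from the K20c float4 timing and the N used in the code below; the ~208 GB/s peak figure for the K20c is an assumption):
// 2 input arrays read + 1 output array written, N = 4e7 floats
double bytes = 3.0 * 4.0e7 * sizeof(float);  // ~0.48 GB moved
double gbps  = bytes / 3.2e-3 / 1.0e9;       // ~150 GB/s effective
// roughly 72% of the K20c's ~208 GB/s theoretical peak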
Below is the disassembled code for the three kernels (compiled for compute capability 2.1).
add_float
Function : _Z9add_floatPfS_S_j
.headerflags #"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0010*/ SHL R2, R2, 0x2; /* 0x6000c00008209c03 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ SHL R0, R0, 0x2; /* 0x6000c00008001c03 */
/*0028*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0030*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc03 */
/*0038*/ #P0 BRA.U 0xd8; /* 0x40000002600081e7 */
/*0040*/ #!P0 ISCADD R2, R0, c[0x0][0x24], 0x2; /* 0x400040009000a043 */
/*0048*/ #!P0 ISCADD R10, R0, c[0x0][0x20], 0x2; /* 0x400040008002a043 */
/*0050*/ #!P0 ISCADD R0, R0, c[0x0][0x28], 0x2; /* 0x40004000a0002043 */
/*0058*/ #!P0 LD R8, [R2]; /* 0x8000000000222085 */
/*0060*/ #!P0 LD R6, [R2+0x4]; /* 0x800000001021a085 */
/*0068*/ #!P0 LD R4, [R2+0x8]; /* 0x8000000020212085 */
/*0070*/ #!P0 LD R9, [R10]; /* 0x8000000000a26085 */
/*0078*/ #!P0 LD R7, [R10+0x4]; /* 0x8000000010a1e085 */
/*0080*/ #!P0 LD R5, [R10+0x8]; /* 0x8000000020a16085 */
/*0088*/ #!P0 LD R3, [R10+0xc]; /* 0x8000000030a0e085 */
/*0090*/ #!P0 LD R2, [R2+0xc]; /* 0x800000003020a085 */
/*0098*/ #!P0 FADD R8, R9, R8; /* 0x5000000020922000 */
/*00a0*/ #!P0 FADD R6, R7, R6; /* 0x500000001871a000 */
/*00a8*/ #!P0 FADD R4, R5, R4; /* 0x5000000010512000 */
/*00b0*/ #!P0 ST [R0], R8; /* 0x9000000000022085 */
/*00b8*/ #!P0 FADD R2, R3, R2; /* 0x500000000830a000 */
/*00c0*/ #!P0 ST [R0+0x4], R6; /* 0x900000001001a085 */
/*00c8*/ #!P0 ST [R0+0x8], R4; /* 0x9000000020012085 */
/*00d0*/ #!P0 ST [R0+0xc], R2; /* 0x900000003000a085 */
/*00d8*/ EXIT; /* 0x8000000000001de7 */
add_float2
Function : _Z10add_float2P6float2S0_S0_j
.headerflags #"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0010*/ SHL R2, R2, 0x1; /* 0x6000c00004209c03 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ SHL R0, R0, 0x1; /* 0x6000c00004001c03 */
/*0028*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0030*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc03 */
/*0038*/ #P0 BRA.U 0xa8; /* 0x40000001a00081e7 */
/*0040*/ #!P0 ISCADD R10, R0, c[0x0][0x20], 0x3; /* 0x400040008002a063 */
/*0048*/ #!P0 ISCADD R11, R0, c[0x0][0x24], 0x3; /* 0x400040009002e063 */
/*0050*/ #!P0 ISCADD R0, R0, c[0x0][0x28], 0x3; /* 0x40004000a0002063 */
/*0058*/ #!P0 LD.64 R4, [R10]; /* 0x8000000000a120a5 */
/*0060*/ #!P0 LD.64 R8, [R11]; /* 0x8000000000b220a5 */
/*0068*/ #!P0 LD.64 R2, [R10+0x8]; /* 0x8000000020a0a0a5 */
/*0070*/ #!P0 LD.64 R6, [R11+0x8]; /* 0x8000000020b1a0a5 */
/*0078*/ #!P0 FADD R9, R5, R9; /* 0x5000000024526000 */
/*0080*/ #!P0 FADD R8, R4, R8; /* 0x5000000020422000 */
/*0088*/ #!P0 FADD R3, R3, R7; /* 0x500000001c30e000 */
/*0090*/ #!P0 FADD R2, R2, R6; /* 0x500000001820a000 */
/*0098*/ #!P0 ST.64 [R0], R8; /* 0x90000000000220a5 */
/*00a0*/ #!P0 ST.64 [R0+0x8], R2; /* 0x900000002000a0a5 */
/*00a8*/ EXIT; /* 0x8000000000001de7 */
add_float4
Function : _Z10add_float4P6float4S0_S0_j
.headerflags #"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ NOP; /* 0x4000000000001de4 */
/*0010*/ MOV R3, c[0x0][0x2c]; /* 0x28004000b000dde4 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ SHR.U32 R3, R3, 0x2; /* 0x5800c0000830dc03 */
/*0028*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0030*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0038*/ ISETP.GE.U32.AND P0, PT, R0, R3, PT; /* 0x1b0e00000c01dc03 */
/*0040*/ #P0 BRA.U 0x98; /* 0x40000001400081e7 */
/*0048*/ #!P0 ISCADD R2, R0, c[0x0][0x20], 0x4; /* 0x400040008000a083 */
/*0050*/ #!P0 ISCADD R3, R0, c[0x0][0x24], 0x4; /* 0x400040009000e083 */
/*0058*/ #!P0 ISCADD R0, R0, c[0x0][0x28], 0x4; /* 0x40004000a0002083 */
/*0060*/ #!P0 LD.128 R8, [R2]; /* 0x80000000002220c5 */
/*0068*/ #!P0 LD.128 R4, [R3]; /* 0x80000000003120c5 */
/*0070*/ #!P0 FADD R7, R11, R7; /* 0x500000001cb1e000 */
/*0078*/ #!P0 FADD R6, R10, R6; /* 0x5000000018a1a000 */
/*0080*/ #!P0 FADD R5, R9, R5; /* 0x5000000014916000 */
/*0088*/ #!P0 FADD R4, R8, R4; /* 0x5000000010812000 */
/*0090*/ #!P0 ST.128 [R0], R4; /* 0x90000000000120c5 */
/*0098*/ EXIT; /* 0x8000000000001de7 */
As can be seen, and as njuffa mentioned, different load instructions are used in the three cases: LD, LD.64, and LD.128, respectively.
Finally, the code:
#include <thrust/device_vector.h>

#define BLOCKSIZE 256

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/********************/
/* ADD_FLOAT KERNEL */
/********************/
__global__ void add_float(float *d_a, float *d_b, float *d_c, unsigned int N) {
    const int tid = 4 * threadIdx.x + blockIdx.x * (4 * blockDim.x);
    if (tid < N) {
        float a1 = d_a[tid];
        float b1 = d_b[tid];
        float a2 = d_a[tid+1];
        float b2 = d_b[tid+1];
        float a3 = d_a[tid+2];
        float b3 = d_b[tid+2];
        float a4 = d_a[tid+3];
        float b4 = d_b[tid+3];
        float c1 = a1 + b1;
        float c2 = a2 + b2;
        float c3 = a3 + b3;
        float c4 = a4 + b4;
        d_c[tid] = c1;
        d_c[tid+1] = c2;
        d_c[tid+2] = c3;
        d_c[tid+3] = c4;
        //if ((tid < 1800) && (tid > 1790)) {
        //    printf("%i %i %i %f %f %f\n", tid, threadIdx.x, blockIdx.x, a1, b1, c1);
        //    printf("%i %i %i %f %f %f\n", tid+1, threadIdx.x, blockIdx.x, a2, b2, c2);
        //    printf("%i %i %i %f %f %f\n", tid+2, threadIdx.x, blockIdx.x, a3, b3, c3);
        //    printf("%i %i %i %f %f %f\n", tid+3, threadIdx.x, blockIdx.x, a4, b4, c4);
        //}
    }
}
/*********************/
/* ADD_FLOAT2 KERNEL */
/*********************/
__global__ void add_float2(float2 *d_a, float2 *d_b, float2 *d_c, unsigned int N) {
    const int tid = 2 * threadIdx.x + blockIdx.x * (2 * blockDim.x);
    if (tid < N) {
        float2 a1 = d_a[tid];
        float2 b1 = d_b[tid];
        float2 a2 = d_a[tid+1];
        float2 b2 = d_b[tid+1];
        float2 c1;
        c1.x = a1.x + b1.x;
        c1.y = a1.y + b1.y;
        float2 c2;
        c2.x = a2.x + b2.x;
        c2.y = a2.y + b2.y;
        d_c[tid] = c1;
        d_c[tid+1] = c2;
    }
}
/*********************/
/* ADD_FLOAT4 KERNEL */
/*********************/
__global__ void add_float4(float4 *d_a, float4 *d_b, float4 *d_c, unsigned int N) {
    const int tid = 1 * threadIdx.x + blockIdx.x * (1 * blockDim.x);
    if (tid < N/4) {
        float4 a1 = d_a[tid];
        float4 b1 = d_b[tid];
        float4 c1;
        c1.x = a1.x + b1.x;
        c1.y = a1.y + b1.y;
        c1.z = a1.z + b1.z;
        c1.w = a1.w + b1.w;
        d_c[tid] = c1;
    }
}
/********/
/* MAIN */
/********/
int main() {
    const int N = 4*10000000;
    const float a = 3.f;
    const float b = 5.f;
    // --- float
    thrust::device_vector<float> d_A(N, a);
    thrust::device_vector<float> d_B(N, b);
    thrust::device_vector<float> d_C(N);
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    add_float<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_A.data()), thrust::raw_pointer_cast(d_B.data()), thrust::raw_pointer_cast(d_C.data()), N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time: %3.1f ms \n", time);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    thrust::host_vector<float> h_float = d_C;
    for (int i=0; i<N; i++) {
        if (h_float[i] != (a+b)) {
            printf("Error for add_float at %i: result is %f\n", i, h_float[i]);
            return -1;
        }
    }
    // --- float2
    thrust::device_vector<float> d_A2(N, a);
    thrust::device_vector<float> d_B2(N, b);
    thrust::device_vector<float> d_C2(N);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    add_float2<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>((float2*)thrust::raw_pointer_cast(d_A2.data()), (float2*)thrust::raw_pointer_cast(d_B2.data()), (float2*)thrust::raw_pointer_cast(d_C2.data()), N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time: %3.1f ms \n", time);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    thrust::host_vector<float> h_float2 = d_C2;
    for (int i=0; i<N; i++) {
        if (h_float2[i] != (a+b)) {
            printf("Error for add_float2 at %i: result is %f\n", i, h_float2[i]);
            return -1;
        }
    }
    // --- float4
    thrust::device_vector<float> d_A4(N, a);
    thrust::device_vector<float> d_B4(N, b);
    thrust::device_vector<float> d_C4(N);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    add_float4<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>((float4*)thrust::raw_pointer_cast(d_A4.data()), (float4*)thrust::raw_pointer_cast(d_B4.data()), (float4*)thrust::raw_pointer_cast(d_C4.data()), N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time: %3.1f ms \n", time);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    thrust::host_vector<float> h_float4 = d_C4;
    for (int i=0; i<N; i++) {
        if (h_float4[i] != (a+b)) {
            printf("Error for add_float4 at %i: result is %f\n", i, h_float4[i]);
            return -1;
        }
    }
    return 0;
}
