float4 not faster than float in CUDA

Edit: njuffa is right; this version was compiled with -G, which disables all optimizations. The new SASS is much faster, as loads and stores are vectorized.
Based on classic examples, I have modified two versions of vector addition in CUDA. The problem is that the float4 version takes twice as long as the float version, even though it launches a quarter as many threads for the same amount of data. Profiling both kernels clearly shows that the float4 version performs on average 4 loads and 4 stores per transaction, while the float version performs only one of each. It sounds like a beginner's issue with misaligned access to float4 (which, by the way, the PTX below seems to confirm), but I can't find where.
I am using the CUDA 7.0 RC with a Quadro K4000.
Any ideas on where to look?
Compile options?
The __align__ keyword?
__global__ void add_float(float *c, const float *a, const float *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    c[i] = a[i] + b[i];
}

__global__ void add_float4(float4 *c, const float4 *a, const float4 *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float4 a1 = a[i];
    float4 b1 = b[i];
    float4 c1;
    c1.x = a1.x + b1.x;
    c1.y = a1.y + b1.y;
    c1.z = a1.z + b1.z;
    c1.w = a1.w + b1.w;
    c[i] = c1;
}
The PTX for the line
float4 a1 = a[i];
says:
...
ld.f32 %f1, [%rd6];
ld.f32 %f2, [%rd6+4];
ld.f32 %f3, [%rd6+8];
ld.f32 %f4, [%rd6+12];
st.f32 [%SP+12], %f4;
st.f32 [%SP+8], %f3;
st.f32 [%SP+4], %f2;
st.f32 [%SP+0], %f1;
...
The SASS from cuobjdump says:
/*0108*/ MOV R10, R0; /* 0x2800000000029de4 */
/*0110*/ ISET.LT.AND R11, R0, RZ, PT; /* 0x108e0000fc02dc23 */
/*0118*/ MOV32I R13, 0x4; /* 0x1800000010035de2 */
/*0120*/ ISETP.LE.U32.AND P0, PT, R13, 0x20, PT; /* 0x198ec00080d1dc03 */
/*0128*/ ISUB R12, 0x20, R13; /* 0x4800c00080d31e03 */
/*0130*/ SHL R11, R11, R13; /* 0x6000000034b2dc03 */
/*0138*/ SHR.U32 R14, R10, R12; /* 0x5800000030a39c03 */
/* 0x22c2804282328047 */
/*0148*/ IADD R11, R11, R14; /* 0x4800000038b2dc03 */
/*0150*/ @!P0 IADD R12, R13, -0x20; /* 0x4800ffff80d32003 */
/*0158*/ @!P0 SHL R11, R10, R12; /* 0x6000000030a2e003 */
/*0160*/ SHL R10, R10, R13; /* 0x6000000034a29c03 */
/*0168*/ MOV R10, R10; /* 0x2800000028029de4 */
/*0170*/ MOV R11, R11; /* 0x280000002c02dde4 */
/*0178*/ IADD R8.CC, R8, R10; /* 0x4801000028821c03 */
/* 0x228042c042828047 */
/*0188*/ IADD.X R9, R9, R11; /* 0x480000002c925c43 */
/*0190*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0198*/ MOV R9, R9; /* 0x2800000024025de4 */
/*01a0*/ LD.E R10, [R8]; /* 0x8400000000829c85 */
/*01a8*/ IADD R12.CC, R8, 0x4; /* 0x4801c00010831c03 */
/*01b0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01b8*/ MOV R12, R12; /* 0x2800000030031de4 */
/* 0x2202828042c2e287 */
/*01c8*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01d0*/ LD.E R11, [R12]; /* 0x8400000000c2dc85 */
/*01d8*/ IADD R12.CC, R8, 0x8; /* 0x4801c00020831c03 */
/*01e0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01e8*/ MOV R12, R12; /* 0x2800000030031de4 */
/*01f0*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01f8*/ LD.E R12, [R12]; /* 0x8400000000c31c85 */
/* 0x2282c202828042c7 */
/*0208*/ IADD R8.CC, R8, 0xc; /* 0x4801c00030821c03 */
/*0210*/ IADD.X R9, R9, RZ; /* 0x48000000fc925c43 */
/*0218*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0220*/ MOV R9, R9; /* 0x2800000024025de4 */
/*0228*/ LD.E R8, [R8]; /* 0x8400000000821c85 */
/*0230*/ IADD R14.CC, R2, 0xc; /* 0x4801c00030239c03 */
/*0238*/ IADD.X R15, R3, RZ; /* 0x48000000fc33dc43 */
/* 0x22828042c2e28047 */
/*0248*/ MOV R14, R14; /* 0x2800000038039de4 */
/*0250*/ MOV R15, R15; /* 0x280000003c03dde4 */
/*0258*/ ST.E [R14], R8; /* 0x9400000000e21c85 */
/*0260*/ IADD R8.CC, R2, 0x8; /* 0x4801c00020221c03 */
/*0268*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*0270*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0278*/ MOV R9, R9; /* 0x2800000024025de4 */
/* 0x22c2e2828042c2e7 */
/*0288*/ ST.E [R8], R12; /* 0x9400000000831c85 */
/*0290*/ IADD R8.CC, R2, 0x4; /* 0x4801c00010221c03 */
/*0298*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02a0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02a8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02b0*/ ST.E [R8], R11; /* 0x940000000082dc85 */
/*02b8*/ IADD R8.CC, R2, RZ; /* 0x48010000fc221c03 */
/* 0x22820042e2828047 */
/*02c8*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02d0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02d8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02e0*/ ST.E [R8], R10; /* 0x9400000000829c85 */
Here is the rest:
void CudaTest()
{
    int size = 8192;
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    float *host_a = (float*)malloc(4 * size * sizeof(float));
    float *host_b = (float*)malloc(4 * size * sizeof(float));
    float *host_c = (float*)malloc(4 * size * sizeof(float));
    float4 *dev_a4 = 0;
    float4 *dev_b4 = 0;
    float4 *dev_c4 = 0;
    float4 *host_a4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_b4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_c4 = (float4*)malloc(size * sizeof(float4));
    for (int i = 0; i < 4 * size; i++)
    {
        host_a[i] = rand() / (float)RAND_MAX;   // cast added: integer division would always yield 0
        host_b[i] = rand() / (float)RAND_MAX;
    }
    for (int i = 0; i < size; i++)
    {
        host_a4[i].x = rand() / (float)RAND_MAX;
        host_a4[i].y = rand() / (float)RAND_MAX;
        host_a4[i].z = rand() / (float)RAND_MAX;
        host_a4[i].w = rand() / (float)RAND_MAX;
        host_b4[i].x = rand() / (float)RAND_MAX;
        host_b4[i].y = rand() / (float)RAND_MAX;
        host_b4[i].z = rand() / (float)RAND_MAX;
        host_b4[i].w = rand() / (float)RAND_MAX;
    }
    // Choose which GPU to run on; change this on a multi-GPU system.
    CUDA_CALL(cudaSetDevice(0));
    // Allocate GPU buffers for three vectors (two input, one output).
    CUDA_CALL(cudaMalloc((void**)&dev_c, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_a, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_b, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_c4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_a4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_b4, size * sizeof(float4)));
    // Copy input vectors from host memory to GPU buffers.
    CUDA_CALL(cudaMemcpy(dev_a, host_a, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b, host_b, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_a4, host_a4, size * sizeof(float4), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b4, host_b4, size * sizeof(float4), cudaMemcpyHostToDevice));
    int local = 256;
    int N = size / local;
    // Launch a kernel on the GPU with one thread for each element.
    add_float<<<4 * N, local>>>(dev_c, dev_a, dev_b);
    // Check for any errors launching the kernel
    CUDA_CALL(cudaGetLastError());
    add_float4<<<N, local>>>(dev_c4, dev_a4, dev_b4);
    // Check for any errors launching the kernel
    CUDA_CALL(cudaGetLastError());
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    CUDA_CALL(cudaDeviceSynchronize());
    // Copy output vectors from GPU buffers to host memory.
    CUDA_CALL(cudaMemcpy(host_c, dev_c, 4 * size * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(host_c4, dev_c4, size * sizeof(float4), cudaMemcpyDeviceToHost));
}

Use of the vector load/store instructions provided by the GPU hardware is considered a performance optimization applied by the compiler, as the code is fully functional using scalar loads and stores. When code is compiled by nvcc with -G (usually for debugging), all optimizations, including the vectorization of loads and stores, are turned off.
To check for load/store vectorization, it is important to look at the actual machine code (SASS) that is executed, rather than at PTX, which is merely an intermediate representation that is compiled into SASS by an optimizing compiler component called ptxas, invoked by the compiler driver nvcc. Run cuobjdump --dump-sass on the executable produced by nvcc to inspect the machine code.
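For example, one can rebuild in release mode and grep the SASS for vector memory instructions (the file name is illustrative; sm_30 matches the Quadro K4000):
$ nvcc -O3 -arch=sm_30 -o vecadd vecadd.cu
$ cuobjdump --dump-sass vecadd | grep -E 'LD\.|ST\.'
A release build like this should show LD.E.128 / ST.E.128 for the float4 kernel; rebuilding with -G instead shows only scalar LD.E / ST.E instructions.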

Related

Bare-metal audio output on Raspberry Pi 3 working in AArch64 asm but not in the C version

I have been trying to write a bare-metal kernel for over a year now, and I am at the point where I am ready to start working on audio output. I have written the code in asm; however, since I'm not great at it, I'm not sure how I can pass audio samples as arguments to an asm function. I tried to rewrite it in C, but it isn't working. This problem is really a spot-the-difference. I know my asm version works, but there the audio sample is hard-wired into the play_audio function. My goal is to have an init function for the audio that takes no arguments, and a play_audio function that takes a pointer to the start of the audio file and a pointer to its end. The file to be played is 16-bit unsigned PCM. The same file that I'm trying to use for the C audio part is used successfully in the asm version. Since I set the hardware PWM to expect 13-bit audio at 44100 Hz, there is a shift to convert each sample from 16 bits to 13 bits, so this isn't a mistake.
Not_working_audio.c
void init_audio_jack_c()//ERROR IN HERE
{
//Set phone jack to pwm output
uint32_t *gpio_addr = (uint32_t *)(PERIPHERAL_BASE + GPIO_BASE);
uint32_t *gpio_gpfsel4_addr = gpio_addr + GPIO_GPFSEL4;
*gpio_gpfsel4_addr = GPIO_FSEL0_ALT0 | GPIO_FSEL5_ALT0;
//Set clock
uint32_t *clock_manager_addr = (uint32_t *)(((PERIPHERAL_BASE + CM_BASE) & 0x0000FFFF) | ((PERIPHERAL_BASE + CM_BASE) & 0xFFFF0000));
*(clock_manager_addr + CM_PWMDIV) = (CM_PASSWORD | 0x2000);
*(clock_manager_addr + CM_PWMCTL) = ((CM_PASSWORD | CM_ENAB) | (CM_SRC_OSCILLATOR + CM_SRC_PLLCPER));
//Set PWM
uint32_t *pwm_manager_addr = (uint32_t *)(((PERIPHERAL_BASE + PWM_BASE) & 0x0000FFFF) | ((PERIPHERAL_BASE + PWM_BASE) & 0xFFFF0000));
*(pwm_manager_addr + PWM_RNG1) = 0x1624;
*(pwm_manager_addr + PWM_RNG2) = 0x1624;
*(pwm_manager_addr + PWM_CTL) = PWM_USEF2 + PWM_PWEN2 + PWM_USEF1 + PWM_PWEN1 + PWM_CLRF1;
printf("[INFO] Audio Init Finished");
}
int32_t play_16bit_unsigned_audio(uint16_t *start, uint16_t *end)
{
if(end < start)
{
printf("[ERROR] End is less than start.");
return 1;
}
if((start - end) % 2 == 0)
{
printf("[ERROR] Isn't a multiple of two so it isn't 16bit");
return 2;
}
uint16_t *end_of_file = (uint16_t *)(uint64_t)(((uint32_t)(uintptr_t)end & 0x0000FFFF) | ((uint32_t)(uintptr_t)end & 0xFFFF0000));
//FIFO write
while(start != end_of_file)
{
uint16_t sample = start[0];
sample >>= 3;
*(uint32_t *)((((uint32_t)(PERIPHERAL_BASE + PWM_BASE) & 0x0000FFFF) | ((uint32_t)(PERIPHERAL_BASE + PWM_BASE) & 0xFFFF0000)) + PWM_FIF1) = sample;
start++;
sample = start[0];
sample >>= 3;
*(uint32_t *)((((uint32_t)(PERIPHERAL_BASE + PWM_BASE) & 0x0000FFFF) | ((uint32_t)(PERIPHERAL_BASE + PWM_BASE) & 0xFFFF0000)) + PWM_FIF1) = sample;
//FIFO wait
while(*(uint32_t *)((((uint32_t)(PERIPHERAL_BASE + PWM_BASE) & 0x0000FFFF) | ((uint32_t)(PERIPHERAL_BASE + PWM_BASE) & 0xFFFF0000)) + PWM_STA) != PWM_FULL1);
start++;
}
printf("[INFO] Completed Audio");
return 0;
}
Working_audio.s
.section .text.init_audio_jack, "ax", %progbits
.balign 4
.globl init_audio_jack;
.type init_audio_jack, %function
init_audio_jack:
mov w0,PERIPHERAL_BASE + GPIO_BASE
mov w1,GPIO_FSEL0_ALT0
orr w1,w1,GPIO_FSEL5_ALT0
str w1,[x0,GPIO_GPFSEL4]
// Set Clock
mov w0, PERIPHERAL_BASE
add w0, w0, CM_BASE
and w0, w0, 0x0000FFFF
mov w1, PERIPHERAL_BASE
add w1, w1, CM_BASE
and w1, w1, 0xFFFF0000
orr w0,w0,w1
mov w1,CM_PASSWORD
orr w1,w1,0x2000 // Bits 0..11 Fractional Part Of Divisor = 0, Bits 12..23 Integer Part Of Divisor = 2
brk #0
str w1,[x0,CM_PWMDIV]
mov w1,CM_PASSWORD
orr w1,w1,CM_ENAB
orr w1,w1,CM_SRC_OSCILLATOR + CM_SRC_PLLCPER // Use 650MHz PLLC Clock
str w1,[x0,CM_PWMCTL]
// Set PWM
mov w0, PERIPHERAL_BASE
add w0, w0, PWM_BASE
and w0, w0, 0x0000FFFF
mov w1,PERIPHERAL_BASE
add w1, w1, PWM_BASE
and w1, w1, 0xFFFF0000
orr w0,w0,w1
mov w1,0x1624 // Range = 13bit 44100Hz Mono
str w1,[x0,PWM_RNG1]
str w1,[x0,PWM_RNG2]
mov w1,PWM_USEF2 + PWM_PWEN2 + PWM_USEF1 + PWM_PWEN1 + PWM_CLRF1
str w1,[x0,PWM_CTL]
.section .text.play_audio, "ax", %progbits
.balign 4
.globl play_audio;
.type play_audio, %function
play_audio:
Loop:
adr x1, _binary_src_audio_Interlude_bin_start // X1 = Sound Sample
ldr w2, =_binary_src_audio_Interlude_bin_end
and w2, w2, 0x0000FFFF // W2 = End Of Sound Sample
ldr w3, =_binary_src_audio_Interlude_bin_end
and w3, w3, 0xFFFF0000
orr w2,w2,w3
FIFO_Write:
ldrh w3,[x1],2 // Write 2 Bytes To FIFO
lsr w3,w3,3 // Convert 16bit To 13bit
str w3,[x0,PWM_FIF1] // FIFO Address
ldrh w3, [x1], 2
lsr w3, w3, 3
str w3, [x0, PWM_FIF1]
FIFO_Wait:
ldr w3,[x0,PWM_STA]
tst w3,PWM_FULL1 // Test Bit 1 FIFO Full
b.ne FIFO_Wait
cmp w1,w2 // Check End Of Sound Sample
b.ne FIFO_Write
b Loop // Play Sample Again
Thanks in advance to anyone that can help!

Can I ensure that NVCC has managed to place an array in registers?

A CUDA kernel with some local, fixed-size array may get compiled so that the array resides in the thread's "local memory", or - if NVCC can determine the position of each array access at compile time, and there are enough registers available - the array might be broken up with its elements residing in registers.
Is it possible to check or to ensure, either via the code or as part of the build process, that a specific array, or all local arrays in a kernel, have been fit into registers? Is doing so supported by any tool?
At runtime
You can query the CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES attribute through the CUDA driver API function cuFuncGetAttribute as a hint as to whether your array has been placed in registers. For some use cases, however, runtime may be too late.
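A minimal sketch of such a check, using the runtime-API counterpart cudaFuncGetAttributes (the kernel here is a stand-in; it only needs to be queried, not launched):
#include <cstdio>
__global__ void kernel(int *out)
{
    int buffer[65];
    for (int w = 0; w < 65; ++w) buffer[w] = out[w] * w;
    out[0] = buffer[out[1] % 65];   // dynamic index, likely to force local memory
}
int main()
{
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, kernel);
    // A nonzero localSizeBytes suggests the array was spilled to local
    // memory rather than promoted to registers.
    printf("local memory per thread: %zu bytes\n", attr.localSizeBytes);
    return 0;
}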
At compile time
You want to have a look at the generated PTX file (using the --keep option of nvcc).
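For instance (the file name is illustrative):
$ nvcc -arch=sm_35 --keep -c kernel.cu
This leaves kernel.ptx among the intermediate files. Adding -Xptxas -v makes ptxas print the per-kernel register count and any local (spill) memory usage, which is a quick first indicator.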
The local data declaration is identified as .local in the PTX. Here is a small example, with a kernel.
#define ww 65
__global__ void kernel(int W, int H, const int *a, int *b)
{
int buffer[ww];
for (int i = threadIdx.x; i < H; i += blockDim.x)
{
#pragma unroll
for (int w = 0; w < ww; ++w)
buffer[w] = a[i + w * W];
for (int j = 5; j < H - 5; ++j)
{
buffer[j % ww] = a[i + (j + 6) * W];
int s = 0;
#pragma unroll
for (int w = 0; w < ww; ++w)
s += buffer[w];
b[i + (j + 6) * W] = s;
}
}
}
When compiled, the PTX contains a declaration of local storage:
.visible .entry _Z6kerneliiPKiPi(
.param .u32 _Z6kerneliiPKiPi_param_0,
.param .u32 _Z6kerneliiPKiPi_param_1,
.param .u64 _Z6kerneliiPKiPi_param_2,
.param .u64 _Z6kerneliiPKiPi_param_3
)
{
.local .align 4 .b8 __local_depot0[260];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b32 %r<219>;
.reg .b64 %rd<81>;
However, when the buffer is rolled explicitly, it is always accessed with indices known at compile time, and registers can be used - no local storage:
#define ww 65
__global__ void kernel(int W, int H, const int *a, int *b)
{
int buffer[ww];
for (int i = threadIdx.x; i < H; i += blockDim.x)
{
#pragma unroll
for (int w = 0; w < ww; ++w)
buffer[w] = a[i + w * W];
for (int j = 5; j < H - 5; ++j)
{
#pragma unroll
for (int w = 0; w < ww-1; ++w)
buffer[w] = buffer[w + 1];
buffer[ww - 1] = a[i + (j + 6) * W];
int s = 0;
#pragma unroll
for (int w = 0; w < ww; ++w)
s += buffer[w];
b[i + (j + 6) * W] = s;
}
}
}
Compiling this yields the following PTX:
.visible .entry _Z6kerneliiPKiPi(
.param .u32 _Z6kerneliiPKiPi_param_0,
.param .u32 _Z6kerneliiPKiPi_param_1,
.param .u64 _Z6kerneliiPKiPi_param_2,
.param .u64 _Z6kerneliiPKiPi_param_3
)
{
.reg .pred %p<5>;
.reg .b32 %r<393>;
.reg .b64 %rd<240>;
Note that the required number of registers may not actually fit, depending on how many are available: these are virtual registers (whose handling has changed somewhat in recent versions of CUDA). Hence the absence of .local .align 4 .b8 __local_depot in the PTX is a prerequisite, but not sufficient.
You then need to look at the SASS. Using nvdisasm on your generated .cubin, search for the STL instruction, which stands for STore Local, as described briefly here. Below are parts of the two disassembled cubins, compiled with two different values of the --maxrregcount compiler switch - first for 32 (note the many occurrences of STL):
//--------------------- .text._Z6kerneliiPKiPi --------------------------
.section .text._Z6kerneliiPKiPi,"ax",@progbits
.sectioninfo @"SHI_REGISTERS=32"
.align 32
.global _Z6kerneliiPKiPi
.type _Z6kerneliiPKiPi,@function
.size _Z6kerneliiPKiPi,(.L_25 - _Z6kerneliiPKiPi)
.other _Z6kerneliiPKiPi,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z6kerneliiPKiPi:
.text._Z6kerneliiPKiPi:
/*0008*/ MOV R1, c[0x0][0x20];
/*0010*/ { IADD32I R1, R1, -0x180;
/*0018*/ S2R R0, SR_TID.X; }
/*0028*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x144], PT;
/*0030*/ NOP;
/*0038*/ NOP;
/*0048*/ @P0 EXIT;
.L_3:
/*0050*/ IADD R2, R0, c[0x0][0x140];
/*0058*/ MOV R30, c[0x0][0x140];
/*0068*/ ISCADD R5.CC, R2.reuse, c[0x0][0x148], 0x2;
/*0070*/ { SHR R3, R2, 0x1e;
/*0078*/ STL [R1+0x14], R5; }
/*0088*/ ISCADD R2, R30.reuse, R0.reuse, 0x1;
/*0090*/ ISCADD R4, R30.reuse, R0.reuse, 0x2;
/*0098*/ ISCADD R20, R30, R0, 0x3;
/*00a8*/ IADD.X R5, R3, c[0x0][0x14c];
/*00b0*/ { SHR R3, R2.reuse, 0x1e;
/*00b8*/ STL [R1+0x10], R5; }
/*00c8*/ ISCADD R2.CC, R2, c[0x0][0x148], 0x2;
/*00d0*/ STL [R1+0x8], R2;
/*00d8*/ SHR R5, R4, 0x1e;
/*00e8*/ IADD.X R2, R3, c[0x0][0x14c];
/*00f0*/ { ISCADD R4.CC, R4, c[0x0][0x148], 0x2;
/*00f8*/ STL [R1+0x4], R2; }
Then for 255 - no occurrence of STL:
//--------------------- .text._Z6kerneliiPKiPi --------------------------
.section .text._Z6kerneliiPKiPi,"ax",@progbits
.sectioninfo @"SHI_REGISTERS=124"
.align 32
.global _Z6kerneliiPKiPi
.type _Z6kerneliiPKiPi,@function
.size _Z6kerneliiPKiPi,(.L_25 - _Z6kerneliiPKiPi)
.other _Z6kerneliiPKiPi,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z6kerneliiPKiPi:
.text._Z6kerneliiPKiPi:
/*0008*/ MOV R1, c[0x0][0x20];
/*0010*/ S2R R0, SR_TID.X;
/*0018*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x144], PT;
/*0028*/ NOP;
/*0030*/ NOP;
/*0038*/ @P0 EXIT;
/*0048*/ MOV R46, c[0x0][0x144];
/*0050*/ IADD R47, RZ, -c[0x0][0x140];
/*0058*/ IADD32I R46, R46, -0x5;
/*0068*/ SHL R47, R47, 0x2;
.L_3:
/*0070*/ ISETP.LT.AND P0, PT, R46, 0x6, PT;
/*0078*/ @P0 BRA `(.L_1);
/*0088*/ MOV R2, c[0x0][0x140];
/*0090*/ ISCADD R2, R2, R0, 0x6;
/*0098*/ SHR R27, R2.reuse, 0x1e;
/*00a8*/ ISCADD R26.CC, R2, c[0x0][0x148], 0x2;
/*00b0*/ SHR R48, R47, 0x1f;
/*00b8*/ IADD.X R27, R27, c[0x0][0x14c];
/*00c8*/ { IADD R44.CC, R47.reuse, R26;
/*00d0*/ LDG.E R49, [R26]; }
/*00d8*/ IADD.X R45, R48.reuse, R27;
/*00e8*/ { IADD R42.CC, R47.reuse, R44 SLOT 0;
/*00f0*/ LDG.E R44, [R44] SLOT 1; }
/*00f8*/ IADD.X R43, R48.reuse, R45;
/*0108*/ { IADD R38.CC, R47, R42 SLOT 0;
/*0110*/ LDG.E R42, [R42] SLOT 1; }
Very much like you, I assume, I wish all of this were better documented.
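The SASS check itself is easy to script, though (file names are illustrative):
$ nvcc -arch=sm_35 -cubin -o kernel.cubin kernel.cu
$ nvdisasm kernel.cubin | grep -c STL
A count of 0 means no local stores, i.e. nothing was spilled to local memory.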

Efficiency of CUDA vector types (float2, float3, float4)

I'm trying to understand the integrate_functor in particles_kernel.cu from the CUDA examples:
struct integrate_functor
{
float deltaTime;
//constructor for functor
//...
template <typename Tuple>
__device__
void operator()(Tuple t)
{
volatile float4 posData = thrust::get<2>(t);
volatile float4 velData = thrust::get<3>(t);
float3 pos = make_float3(posData.x, posData.y, posData.z);
float3 vel = make_float3(velData.x, velData.y, velData.z);
// update position and velocity
// ...
// store new position and velocity
thrust::get<0>(t) = make_float4(pos, posData.w);
thrust::get<1>(t) = make_float4(vel, velData.w);
}
};
We call make_float4(pos, age) but make_float4 is defined in vector_functions.h as
static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w)
{
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
Are CUDA vector types (float3 and float4) more efficient for the GPU, and how does the compiler know how to overload the function make_float4?
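As to the overloading part of the question: this is ordinary C++ function overloading. Besides the four-float version quoted above, the CUDA samples ship a helper header (helper_math.h, formerly cutil_math.h) with an overload taking a float3 and a float; a sketch of what it looks like (not the verbatim header):
// Sketch of the float3 + float overload; the compiler simply picks the
// overload whose parameter types match the call make_float4(pos, age).
static __inline__ __host__ __device__ float4 make_float4(float3 a, float w)
{
    return make_float4(a.x, a.y, a.z, w);
}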
I'm expanding njuffa's comment into a worked example. In that example, I'm simply adding two arrays in three different ways: loading the data as float, float2 or float4.
These are the timings on a GT540M and on a Kepler K20c card:
GT540M
float - Elapsed time: 74.1 ms
float2 - Elapsed time: 61.0 ms
float4 - Elapsed time: 56.1 ms
Kepler K20c
float - Elapsed time: 4.4 ms
float2 - Elapsed time: 3.3 ms
float4 - Elapsed time: 3.2 ms
As can be seen, loading the data as float4 is the fastest approach.
Below are the disassembled codes for the three kernels (compiled for compute capability 2.1).
add_float
Function : _Z9add_floatPfS_S_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0010*/ SHL R2, R2, 0x2; /* 0x6000c00008209c03 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ SHL R0, R0, 0x2; /* 0x6000c00008001c03 */
/*0028*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0030*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc03 */
/*0038*/ @P0 BRA.U 0xd8; /* 0x40000002600081e7 */
/*0040*/ @!P0 ISCADD R2, R0, c[0x0][0x24], 0x2; /* 0x400040009000a043 */
/*0048*/ @!P0 ISCADD R10, R0, c[0x0][0x20], 0x2; /* 0x400040008002a043 */
/*0050*/ @!P0 ISCADD R0, R0, c[0x0][0x28], 0x2; /* 0x40004000a0002043 */
/*0058*/ @!P0 LD R8, [R2]; /* 0x8000000000222085 */
/*0060*/ @!P0 LD R6, [R2+0x4]; /* 0x800000001021a085 */
/*0068*/ @!P0 LD R4, [R2+0x8]; /* 0x8000000020212085 */
/*0070*/ @!P0 LD R9, [R10]; /* 0x8000000000a26085 */
/*0078*/ @!P0 LD R7, [R10+0x4]; /* 0x8000000010a1e085 */
/*0080*/ @!P0 LD R5, [R10+0x8]; /* 0x8000000020a16085 */
/*0088*/ @!P0 LD R3, [R10+0xc]; /* 0x8000000030a0e085 */
/*0090*/ @!P0 LD R2, [R2+0xc]; /* 0x800000003020a085 */
/*0098*/ @!P0 FADD R8, R9, R8; /* 0x5000000020922000 */
/*00a0*/ @!P0 FADD R6, R7, R6; /* 0x500000001871a000 */
/*00a8*/ @!P0 FADD R4, R5, R4; /* 0x5000000010512000 */
/*00b0*/ @!P0 ST [R0], R8; /* 0x9000000000022085 */
/*00b8*/ @!P0 FADD R2, R3, R2; /* 0x500000000830a000 */
/*00c0*/ @!P0 ST [R0+0x4], R6; /* 0x900000001001a085 */
/*00c8*/ @!P0 ST [R0+0x8], R4; /* 0x9000000020012085 */
/*00d0*/ @!P0 ST [R0+0xc], R2; /* 0x900000003000a085 */
/*00d8*/ EXIT; /* 0x8000000000001de7 */
add_float2
Function : _Z10add_float2P6float2S0_S0_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0010*/ SHL R2, R2, 0x1; /* 0x6000c00004209c03 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ SHL R0, R0, 0x1; /* 0x6000c00004001c03 */
/*0028*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0030*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc03 */
/*0038*/ @P0 BRA.U 0xa8; /* 0x40000001a00081e7 */
/*0040*/ @!P0 ISCADD R10, R0, c[0x0][0x20], 0x3; /* 0x400040008002a063 */
/*0048*/ @!P0 ISCADD R11, R0, c[0x0][0x24], 0x3; /* 0x400040009002e063 */
/*0050*/ @!P0 ISCADD R0, R0, c[0x0][0x28], 0x3; /* 0x40004000a0002063 */
/*0058*/ @!P0 LD.64 R4, [R10]; /* 0x8000000000a120a5 */
/*0060*/ @!P0 LD.64 R8, [R11]; /* 0x8000000000b220a5 */
/*0068*/ @!P0 LD.64 R2, [R10+0x8]; /* 0x8000000020a0a0a5 */
/*0070*/ @!P0 LD.64 R6, [R11+0x8]; /* 0x8000000020b1a0a5 */
/*0078*/ @!P0 FADD R9, R5, R9; /* 0x5000000024526000 */
/*0080*/ @!P0 FADD R8, R4, R8; /* 0x5000000020422000 */
/*0088*/ @!P0 FADD R3, R3, R7; /* 0x500000001c30e000 */
/*0090*/ @!P0 FADD R2, R2, R6; /* 0x500000001820a000 */
/*0098*/ @!P0 ST.64 [R0], R8; /* 0x90000000000220a5 */
/*00a0*/ @!P0 ST.64 [R0+0x8], R2; /* 0x900000002000a0a5 */
/*00a8*/ EXIT; /* 0x8000000000001de7 */
add_float4
Function : _Z10add_float4P6float4S0_S0_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ NOP; /* 0x4000000000001de4 */
/*0010*/ MOV R3, c[0x0][0x2c]; /* 0x28004000b000dde4 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ SHR.U32 R3, R3, 0x2; /* 0x5800c0000830dc03 */
/*0028*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0030*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0038*/ ISETP.GE.U32.AND P0, PT, R0, R3, PT; /* 0x1b0e00000c01dc03 */
/*0040*/ @P0 BRA.U 0x98; /* 0x40000001400081e7 */
/*0048*/ @!P0 ISCADD R2, R0, c[0x0][0x20], 0x4; /* 0x400040008000a083 */
/*0050*/ @!P0 ISCADD R3, R0, c[0x0][0x24], 0x4; /* 0x400040009000e083 */
/*0058*/ @!P0 ISCADD R0, R0, c[0x0][0x28], 0x4; /* 0x40004000a0002083 */
/*0060*/ @!P0 LD.128 R8, [R2]; /* 0x80000000002220c5 */
/*0068*/ @!P0 LD.128 R4, [R3]; /* 0x80000000003120c5 */
/*0070*/ @!P0 FADD R7, R11, R7; /* 0x500000001cb1e000 */
/*0078*/ @!P0 FADD R6, R10, R6; /* 0x5000000018a1a000 */
/*0080*/ @!P0 FADD R5, R9, R5; /* 0x5000000014916000 */
/*0088*/ @!P0 FADD R4, R8, R4; /* 0x5000000010812000 */
/*0090*/ @!P0 ST.128 [R0], R4; /* 0x90000000000120c5 */
/*0098*/ EXIT; /* 0x8000000000001de7 */
As can be seen, and as mentioned by njuffa, different load instructions are used in the three cases: LD, LD.64 and LD.128, respectively.
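One caveat before reusing the pattern below: LD.128/ST.128 require the address to be 16-byte aligned. Casting the base pointer of a cudaMalloc'ed float array to float4* is safe, since cudaMalloc returns generously aligned pointers, but casting an offset pointer may not be. A sketch of the cast-based variant (the kernel name is made up):
__global__ void add_float4_cast(const float *d_a, const float *d_b, float *d_c, unsigned int N)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N / 4) {
        // Reinterpret the float arrays as float4; valid only while the
        // addresses stay 16-byte aligned (e.g. d_a + 1 would break this).
        float4 a = reinterpret_cast<const float4 *>(d_a)[tid];
        float4 b = reinterpret_cast<const float4 *>(d_b)[tid];
        float4 c = make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
        reinterpret_cast<float4 *>(d_c)[tid] = c;
    }
}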
Finally, the code:
#include <thrust/device_vector.h>
#define BLOCKSIZE 256
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/********************/
/* ADD_FLOAT KERNEL */
/********************/
__global__ void add_float(float *d_a, float *d_b, float *d_c, unsigned int N) {
const int tid = 4 * threadIdx.x + blockIdx.x * (4 * blockDim.x);
if (tid < N) {
float a1 = d_a[tid];
float b1 = d_b[tid];
float a2 = d_a[tid+1];
float b2 = d_b[tid+1];
float a3 = d_a[tid+2];
float b3 = d_b[tid+2];
float a4 = d_a[tid+3];
float b4 = d_b[tid+3];
float c1 = a1 + b1;
float c2 = a2 + b2;
float c3 = a3 + b3;
float c4 = a4 + b4;
d_c[tid] = c1;
d_c[tid+1] = c2;
d_c[tid+2] = c3;
d_c[tid+3] = c4;
//if ((tid < 1800) && (tid > 1790)) {
//printf("%i %i %i %f %f %f\n", tid, threadIdx.x, blockIdx.x, a1, b1, c1);
//printf("%i %i %i %f %f %f\n", tid+1, threadIdx.x, blockIdx.x, a2, b2, c2);
//printf("%i %i %i %f %f %f\n", tid+2, threadIdx.x, blockIdx.x, a3, b3, c3);
//printf("%i %i %i %f %f %f\n", tid+3, threadIdx.x, blockIdx.x, a4, b4, c4);
//}
}
}
/*********************/
/* ADD_FLOAT2 KERNEL */
/*********************/
__global__ void add_float2(float2 *d_a, float2 *d_b, float2 *d_c, unsigned int N) {
const int tid = 2 * threadIdx.x + blockIdx.x * (2 * blockDim.x);
if (tid < N) {
float2 a1 = d_a[tid];
float2 b1 = d_b[tid];
float2 a2 = d_a[tid+1];
float2 b2 = d_b[tid+1];
float2 c1;
c1.x = a1.x + b1.x;
c1.y = a1.y + b1.y;
float2 c2;
c2.x = a2.x + b2.x;
c2.y = a2.y + b2.y;
d_c[tid] = c1;
d_c[tid+1] = c2;
}
}
/*********************/
/* ADD_FLOAT4 KERNEL */
/*********************/
__global__ void add_float4(float4 *d_a, float4 *d_b, float4 *d_c, unsigned int N) {
const int tid = 1 * threadIdx.x + blockIdx.x * (1 * blockDim.x);
if (tid < N/4) {
float4 a1 = d_a[tid];
float4 b1 = d_b[tid];
float4 c1;
c1.x = a1.x + b1.x;
c1.y = a1.y + b1.y;
c1.z = a1.z + b1.z;
c1.w = a1.w + b1.w;
d_c[tid] = c1;
}
}
/********/
/* MAIN */
/********/
int main() {
const int N = 4*10000000;
const float a = 3.f;
const float b = 5.f;
// --- float
thrust::device_vector<float> d_A(N, a);
thrust::device_vector<float> d_B(N, b);
thrust::device_vector<float> d_C(N);
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
add_float<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_A.data()), thrust::raw_pointer_cast(d_B.data()), thrust::raw_pointer_cast(d_C.data()), N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.1f ms \n", time); gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<float> h_float = d_C;
for (int i=0; i<N; i++) {
if (h_float[i] != (a+b)) {
printf("Error for add_float at %i: result is %f\n",i, h_float[i]);
return -1;
}
}
// --- float2
thrust::device_vector<float> d_A2(N, a);
thrust::device_vector<float> d_B2(N, b);
thrust::device_vector<float> d_C2(N);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
add_float2<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>((float2*)thrust::raw_pointer_cast(d_A2.data()), (float2*)thrust::raw_pointer_cast(d_B2.data()), (float2*)thrust::raw_pointer_cast(d_C2.data()), N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.1f ms \n", time); gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<float> h_float2 = d_C2;
for (int i=0; i<N; i++) {
if (h_float2[i] != (a+b)) {
printf("Error for add_float2 at %i: result is %f\n",i, h_float2[i]);
return -1;
}
}
// --- float4
thrust::device_vector<float> d_A4(N, a);
thrust::device_vector<float> d_B4(N, b);
thrust::device_vector<float> d_C4(N);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
add_float4<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>((float4*)thrust::raw_pointer_cast(d_A4.data()), (float4*)thrust::raw_pointer_cast(d_B4.data()), (float4*)thrust::raw_pointer_cast(d_C4.data()), N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.1f ms \n", time); gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<float> h_float4 = d_C4;
for (int i=0; i<N; i++) {
if (h_float4[i] != (a+b)) {
printf("Error for add_float4 at %i: result is %f\n",i, h_float4[i]);
return -1;
}
}
return 0;
}

Find minimum and maximum value of an array using ARM NEON instructions

I have the following code which I would like to optimise using ARM NEON instructions. How can I implement it?
Thanks for the answers
unsigned char someVector[] = {1, 2, 4, 1, 2, 0, 8, 100};
unsigned char maxVal = 0, minVal = 255;
for (int i = 0; i < sizeof(someVector); i++)
{
if (someVector[i] < minVal)
{
minVal = someVector[i];
}
else if (someVector[i] > maxVal)
{
maxVal = someVector[i];
}
}
Below is a highly optimized example of how to find the min and max of a large array. The function simply returns if size is smaller than 128:
/*
* minmax.S
*
* Created on: 2014. 10. 29.
* Author: Jake Lee
*/
// unsigned int minmax(unsigned char *pSrc, unsigned int size);
.text
.arm
.global minmax
pSrc .req r0
size .req r1
qmin1 .req q0
dmina .req d0
dminb .req d1
qmax1 .req q1
dmaxa .req d2
dmaxb .req d3
qmin2 .req q2
qmax2 .req q3
.align 5
.func
minmax:
subs size, size, #128
bxmi lr
vmov.i8 qmin1, #0xff
vmov.i8 qmax1, #0
vmov.i8 qmin2, #0xff
vmov.i8 qmax2, #0
.align 5
1:
vld1.8 {q8, q9}, [pSrc]!
vld1.8 {q10, q11}, [pSrc]!
vld1.8 {q12, q13}, [pSrc]!
vld1.8 {q14, q15}, [pSrc]!
subs size, size, #128
pld [pSrc, #64*3]
pld [pSrc, #64*4]
vmin.u8 qmin1, q8
vmax.u8 qmax1, q8
vmin.u8 qmin2, q9
vmax.u8 qmax2, q9
vmin.u8 qmin1, q10
vmax.u8 qmax1, q10
vmin.u8 qmin2, q11
vmax.u8 qmax2, q11
vmin.u8 qmin1, q12
vmax.u8 qmax1, q12
vmin.u8 qmin2, q13
vmax.u8 qmax2, q13
vmin.u8 qmin1, q14
vmax.u8 qmax1, q14
vmin.u8 qmin2, q15
vmax.u8 qmax2, q15
bpl 1b
// deal with residuals (size % 128)
cmp size, #-128
addgt pSrc, pSrc, size
bgt 1b
// shrink to sixteen
vmin.u8 qmin1, qmin2
vmax.u8 qmax1, qmax2
// shrink to eight
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// shrink to four
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// shrink to two
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// shrink to one
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
vmov r0, dmina[0]
vmov r1, dmaxa[0]
and r0, r0, #0xff
and r1, r1, #0xff
orr r0, r0, r1, lsl #16
bx lr
.endfunc
.end
The return value is an unsigned int: the lower 16 bits contain the min, the upper ones the max:
result = minmax(pSrc, size);
min = result & 0xff;
max = result >> 16;
GCC will auto-vectorize this, with only small modifications.
unsigned char someVector[256] = { 1, 2, 4, 1, 2, 0, 8, 100 };
unsigned char maxVal = 0, minVal = 255;
void f(void)
{
unsigned char mn = 255, mx = 0;
for (int i = 0; i < sizeof(someVector); i++) {
if (someVector[i] < mn) {
mn = someVector[i];
}
if (someVector[i] > mx) {
mx = someVector[i];
}
}
maxVal = mx;
minVal = mn;
}
compile with
$ arm-unknown-linux-gnueabihf-gcc -O3 -std=c11 -mfpu=neon -c test.c
or
$ arm-unknown-linux-gnueabihf-gcc -O2 -ftree-vectorize -std=c11 -mfpu=neon -c test.c
You can do better than GCC if you write NEON intrinsics or assembler.
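For instance, with NEON intrinsics, a sketch might look like the following (assuming size is a non-zero multiple of 16; the function name and contract are made up for illustration):
#include <arm_neon.h>
#include <stddef.h>
/* Sketch: min/max of a byte buffer; assumes size % 16 == 0 and size > 0. */
void minmax_u8(const unsigned char *src, size_t size,
               unsigned char *min_out, unsigned char *max_out)
{
    uint8x16_t vmin = vdupq_n_u8(0xff);
    uint8x16_t vmax = vdupq_n_u8(0x00);
    for (size_t i = 0; i < size; i += 16) {
        uint8x16_t v = vld1q_u8(src + i);
        vmin = vminq_u8(vmin, v);
        vmax = vmaxq_u8(vmax, v);
    }
    /* Horizontal reduction: fold 16 lanes down to 1 by pairwise min/max,
       mirroring the "shrink" steps of the assembly version above. */
    uint8x8_t m = vpmin_u8(vget_low_u8(vmin), vget_high_u8(vmin));
    uint8x8_t M = vpmax_u8(vget_low_u8(vmax), vget_high_u8(vmax));
    m = vpmin_u8(m, m);  /* 8 lanes -> 4 */
    M = vpmax_u8(M, M);
    m = vpmin_u8(m, m);  /* 4 -> 2 */
    M = vpmax_u8(M, M);
    m = vpmin_u8(m, m);  /* 2 -> 1 */
    M = vpmax_u8(M, M);
    *min_out = vget_lane_u8(m, 0);
    *max_out = vget_lane_u8(M, 0);
}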

Cuda kernel not launched when creating too many doubles in __device__ function

I am using Nvidia Nsight to debug my code and I noticed that my distance kernel is sometimes not launching. The code attempts to find the distance between two polygons. It does this by going through each segment of each polygon and finding the distance between the two segments. The following is my code:
__device__ double point_segment_distance(double px, double py, double x1, double y1, double x2, double y2)
{
double dx = x2 - x1;
double dy = y2 - y1;
//
//if (dx < 0.01 && dy < 0.01)
//{
// return hypot(px - x1, py - y1);
//}
//double t = ((px - x1) * dx + (py - y1) * dy) / (dx * dx + dy * dy);
//if (t < 0)
//{
// dx = px - x1;
// dy = py - y1;
//}
//else if (t > 1)
//{
// dx = px - x2;
// dy = py - y2;
//}
//else
//{
// double near_x = x1 + t * dx;
// double near_y = y1 + t * dy;
// dx = px - near_x;
// dy = py - near_y;
//}
//return hypot(dx, dy);
return 10.0;
}
__device__ bool segments_intersect(double x11, double y11, double x12, double y12, double x21, double y21, double x22, double y22)
{
double dx1 = x12 - x11;
double dy1 = y12 - y11;
double dx2 = x22 - x21;
double dy2 = y22 - y21;
double delta = dx2 * dy1 - dy2 * dx1;
if (delta < 0.01)
{
return false;
}
double s = (dx1 * (y21 - y11) + dy1 * (x11 - x21)) / delta;
double t = (dx2 * (y11 - y21) + dy2 * (x21 - x11)) / (-delta);
return (0 <= s && s <= 1 && 0 <= t && t <= 1);
}
__device__ double segments_distance(double x11, double y11, double x12, double y12, double x21, double y21, double x22, double y22)
{
if (segments_intersect(x11, y11, x12, y12, x21, y21, x22, y22))
{
return 0.0;
}
double minimumDist = 999999;
double tempDist = point_segment_distance(x11, y11, x21, y21, x22, y22);
if (tempDist < minimumDist)
{
minimumDist = tempDist;
}
tempDist = point_segment_distance(x12, y12, x21, y21, x22, y22);
if (tempDist < minimumDist)
{
minimumDist = tempDist;
}
tempDist = point_segment_distance(x21, y21, x11, y11, x12, y12);
if (tempDist < minimumDist)
{
minimumDist = tempDist;
}
tempDist = point_segment_distance(x22, y22, x11, y11, x12, y12);
if (tempDist < minimumDist)
{
minimumDist = tempDist;
}
return minimumDist;
}
__global__ void distance(double *x0, double *y0, double *x1, double *y1, double *dist, int *length0, int *length1, int *numDone)
{
int numComp = threadIdx.x + blockDim.x*blockIdx.x + *numDone;
int index = threadIdx.x + blockDim.x*blockIdx.x;
dist[index] = 99999;
if (numComp < ((*length0)*(*length1)))
{
int spot0 = numComp%(*length0);
int spot1 = numComp/(*length0);
dist[index] = segments_distance(x0[spot0], y0[spot0], x0[(spot0+1)%(*length0)], y0[(spot0+1)%(*length0)], x1[spot1], y1[spot1], x1[(spot1+1)%(*length1)], y1[(spot1+1)%(*length1)]);
}
}
void gpuDistance(double *x0, double *y0, double *x1, double *y1)
{
...
distance<<<165, 1024>>>(dev_x0, dev_y0, dev_x1, dev_y1, dev_dist, dev_length0, dev_length1, dev_numDone);
...
}
I commented out much of point_segment_distance in order to help me locate the error. As written, the distance kernel will not launch; I know this because I am using the Nsight CUDA debugger and it doesn't hit my breakpoints.
However, if I also comment out the line "double dy = y2 - y1;" in point_segment_distance, the distance kernel launches. How is this possible? Why would creating one more double cause the kernel not to launch? Is there a limit to the number of doubles that may be created on the GPU? I have a Tesla C2075. I am aware of the local memory limit of 512 KB per thread, but looking at my code, I can't imagine that I'm anywhere near that limit. Thanks for any help!
Not hitting breakpoints does not mean that the kernel is not executed, since the compiler has the freedom to perform aggressive optimizations on the code. To check the correctness of kernel launches, you should instead perform canonical CUDA error checking, in the sense of talonmies' post:
What is the canonical way to check for errors using the CUDA runtime API?
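Applied to the launch above, and reusing the gpuErrchk macro shown in an earlier answer on this page, the pattern is roughly:
// Sketch: canonical error checking around the launch in gpuDistance().
distance<<<165, 1024>>>(dev_x0, dev_y0, dev_x1, dev_y1, dev_dist, dev_length0, dev_length1, dev_numDone);
gpuErrchk(cudaPeekAtLastError());      // reports launch-configuration errors
gpuErrchk(cudaDeviceSynchronize());    // reports errors raised during execution
Note that kernels can fail to launch for resource reasons (for example, registers per thread times 1024 threads per block exceeding the per-block limit), which is exactly the kind of error such checking would reveal.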
To get an idea of the optimizations the compiler can perform, consider for example the following code:
__global__ void point_segment_distance(double* distance_squared, const double* __restrict__ x1, const double* __restrict__ y1, const double* __restrict__ x2, const double* __restrict__ y2)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
double dx = x2[i] - x1[i];
double dy = y2[i] - y1[i];
//distance_squared[i] = dx*dx+dy*dy;
}
Note the commented-out instruction. When that store is commented out, everything inside the kernel function becomes dead code, since nothing it computes reaches global memory, and the compiler eliminates it. Indeed, the disassembled code becomes
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ EXIT ; /* 0x8000000000001de7 */
When the instruction is uncommented, the compiler produces
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R2, SR_CTAID.X; /* 0x2c00000094009c04 */
/*0010*/ S2R R3, SR_TID.X; /* 0x2c0000008400dc04 */
/*0018*/ MOV32I R0, 0x8; /* 0x1800000020001de2 */
/*0020*/ IMAD R18, R2, c[0x0][0x8], R3; /* 0x2006400020249ca3 */
/*0028*/ IMAD R8.CC, R18, R0, c[0x0][0x38]; /* 0x20018000e1221ca3 */
/*0030*/ IMAD.HI.X R9, R18, R0, c[0x0][0x3c]; /* 0x20808000f1225ce3 */
/*0038*/ IMAD R16.CC, R18, R0, c[0x0][0x40]; /* 0x2001800101241ca3 */
/*0040*/ LD.E.64 R10, [R8]; /* 0x8400000000829ca5 */
/*0048*/ IMAD.HI.X R17, R18, R0, c[0x0][0x44]; /* 0x2080800111245ce3 */
/*0050*/ IMAD R12.CC, R18, R0, c[0x0][0x30]; /* 0x20018000c1231ca3 */
/*0058*/ LD.E.64 R4, [R16]; /* 0x8400000001011ca5 */
/*0060*/ IMAD.HI.X R13, R18, R0, c[0x0][0x34]; /* 0x20808000d1235ce3 */
/*0068*/ IMAD R6.CC, R18, R0, c[0x0][0x28]; /* 0x20018000a1219ca3 */
/*0070*/ LD.E.64 R2, [R12]; /* 0x8400000000c09ca5 */
/*0078*/ IMAD.HI.X R7, R18, R0, c[0x0][0x2c]; /* 0x20808000b121dce3 */
/*0080*/ LD.E.64 R14, [R6]; /* 0x8400000000639ca5 */
/*0088*/ DADD R2, R4, -R2; /* 0x4800000008409d01 */
/*0090*/ DMUL R6, R2, R2; /* 0x5000000008219c01 */
/*0098*/ DADD R4, R10, -R14; /* 0x4800000038a11d01 */
/*00a0*/ IMAD R2.CC, R18, R0, c[0x0][0x20]; /* 0x2001800081209ca3 */
/*00a8*/ DFMA R4, R4, R4, R6; /* 0x200c000010411c01 */
/*00b0*/ IMAD.HI.X R3, R18, R0, c[0x0][0x24]; /* 0x208080009120dce3 */
/*00b8*/ ST.E.64 [R2], R4; /* 0x9400000000211ca5 */
/*00c0*/ EXIT ; /* 0x8000000000001de7 */
and the code is not dead anymore.
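If one wants to keep such an experimental kernel alive without restructuring it, a common trick is to make the result observable behind a runtime value the compiler cannot evaluate; a sketch (the kernel name and signature are illustrative):
__global__ void point_segment_distance_bench(double *sink, const double *x1, const double *y1, const double *x2, const double *y2, int flag)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    double dx = x2[i] - x1[i];
    double dy = y2[i] - y1[i];
    // 'flag' is passed as 0 at run time; the compiler must still keep the
    // arithmetic, because the store is reachable as far as it can tell.
    if (flag) sink[i] = dx * dx + dy * dy;
}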
