Question about address operations in neon intrinstics - arm

I'm rearranging an array in my project on ARMv7. Now I get the elements' address d[] in the order I expect. To make the code more efficient, I want to use neon intrinstics in C++. Now my problem is, I can load the address array d[] by using vld1q_s32(), but I do not know how to read the elements of this vector as addresses.The instructions I know can only simply duplicate one vector.
This problem has been confusing me for several days. Or neon cannot do certain thing?
Thanks for your answering.
Here is my code:
void InputRearrange(int8_t* din, int8_t* dout, const int x, const int y){
int8_t* dout_array[16];
int out = 0;
dout_array[0] = din;
for(int n = 1; n < 16; n++) {//get the address of the first line in z-axis
dout_array[n] = dout_array[n - 1] + x*y;
}
for(int y_count = 0; y_count < y; y_count++) {
for(int x_count = 0; x_count < x; x_count++) {
for(int z_count = 0; z_count < 16; z_count++) {
dout[out++] = *(dout_array[k]++);//dout_array[k]++ let dout_array[k] moves in x-axis and I want to change this loop into neon intrinsics.
}
}
}
}
}
din[ ] is the original array and is like a 3-D array as a cube but stored as a 1-D one. The cube has three axis: x, y , z(=16). The original way array din[ ] stores the elements from x-axis first and then y-axis and last z-axis. But my code changed the order to z-axis first and then x-axis and last y-axis. I would like to use neon intrinsics in the final for loop, but it seems that it cannot be realized.

Your code rearranges a three-dimensional array int8_t (&output)[y][x][16] to int8_t (&input)[16][y][x], which is also equivalent to transposing a 2d array of int8_t (&out)[x*y][16] to int8_t (&in)[16][x*y].
This can definitely benefit from arm neon intrinsics that can interleave/deinterleave either registers (vzip,vuzp) or memory content (vldN, vstN).
// planarizes next 128 bytes to 16 planes
void planarize(int8_t *in, int8_t *out, int xy) {
int8_t * o_1 = out + 4*xy;
int8_t * o_2 = out + 8*xy;
int8_t * o_3 = out + 12*xy;
int8x16x4_t a = vld4q_s8(in); in+=64;
int8x16x4_t b = vld4q_s8(in); in+=64;
int8x16x2_t c = vuzpq_s8(a.val[0], b.val[0]);
int8x16x2_t d = vuzpq_s8(a.val[1], b.val[1]);
int8x16x2_t e = vuzpq_s8(a.val[2], b.val[2]);
int8x16x2_t f = vuzpq_s8(a.val[3], b.val[3]);
c = vuzpq_s8(c.val[0], c.val[1]);
d = vuzpq_s8(d.val[0], d.val[1]);
e = vuzpq_s8(e.val[0], e.val[1]);
f = vuzpq_s8(f.val[0], f.val[1]);
// now c = 0 16 32 48 64 80 96 112 4 20 36 52 68 84 100 116
// 8 24 40 56 72 88 104 120 12 28 44 60 76 92 108 124
// d = c + 1, e = d + 1, f = e + 1
vst1_s8(out + 0 * xy, vget_low_s8(c.val[0]);
vst1_s8(out + 1 * xy, vget_low_s8(d.val[0]);
vst1_s8(out + 2 * xy, vget_low_s8(e.val[0]);
vst1_s8(out + 3 * xy, vget_low_s8(f.val[0]);
vst1_s8(o_1 + 4 * xy, vget_high_s8(c.val[0]);
vst1_s8(o_1 + 5 * xy, vget_high_s8(d.val[0]);
vst1_s8(o_1 + 6 * xy, vget_high_s8(e.val[0]);
vst1_s8(o_1 + 7 * xy, vget_high_s8(f.val[0]);
vst1_s8(o_2 + 0 * xy, vget_low_s8(c.val[1]);
vst1_s8(o_2 + 1 * xy, vget_low_s8(d.val[1]);
vst1_s8(o_2 + 2 * xy, vget_low_s8(e.val[1]);
vst1_s8(o_2 + 3 * xy, vget_low_s8(f.val[1]);
vst1_s8(o_3 + 4 * xy, vget_high_s8(c.val[1]);
vst1_s8(o_3 + 5 * xy, vget_high_s8(d.val[1]);
vst1_s8(o_3 + 6 * xy, vget_high_s8(e.val[1]);
vst1_s8(o_3 + 7 * xy, vget_high_s8(f.val[1]);
}
The opposite would interleave from 16 independent planes
int8x16x2_t load4(int8_t *in, int xy) {
int8x8_t a0 = vld1_s8(in);
int8x8_t a1 = vld1_s8(in + xy);
int8x8_t a2 = vld1_s8(in + 2 * xy);
int8x8_t a3 = vld1_s8(in + 3 * xy);
auto a = vzipq_s8(vcombine_s8(a0, a0), vcombine_s8(a1, a1)).val[0];
auto b = vzipq_s8(vcombine_s8(a2, a2), vcombine_s8(a3, a3)).val[0];
return vzipq_s8(a,b);
}
int8_t *store4(int8x16x2_t a, int8x16x2_t b, int8x16x2_t c, int8x16x2_t d, int8_t *out) {
int32x4x4_t A{
vreinterpretq_s32_s8(a.val[0]),
vreinterpretq_s32_s8(b.val[0]),
vreinterpretq_s32_s8(c.val[0]),
vreinterpretq_s32_s8(d.val[0])};
int32x4x4_t B{
vreinterpretq_s32_s8(a.val[1]),
vreinterpretq_s32_s8(b.val[1]),
vreinterpretq_s32_s8(c.val[1]),
vreinterpretq_s32_s8(d.val[1])};
vst4q_s32((int32_t*)out, A); out += 64;
vst4q_s32((int32_t*)out, B); out += 64;
return out;
}
void interleave(int8_t *in, int8_t *out, int xy) {
int w = xy;
while (w >= 8) {
auto a = load4(in, xy);
auto b = load4(in + 4*xy, xy);
auto c = load4(in + 8*xy, xy);
auto d = load4(in + 12*xy, xy);
in += 8;
out = store4(a,b,c,d, out);
w -= 8;
}
}
Handling the excess (xy & 7 != 0) can be done by processing one full block aligned at in_ptr + xy - 8 and out_ptr + xy * 16 - 8*16.

Related

How to optimize this OpenCL kernel?

I'm working on a project and I've got some problems with this OpenCL kernel :-(
__kernel void gemm_fast_5(
__global double *ar, __global double *br, __global double *cr,
__global double *pr, __global double *ur,
unsigned long c, unsigned long c2,
unsigned long c3, unsigned long c4,
unsigned long c5, unsigned long m,
unsigned char com
){
unsigned long i = get_global_id(0);
unsigned long j = get_global_id(1);
unsigned long x = get_local_id(0);
unsigned long y = get_local_id(1);
unsigned long cur = i*c3 + j, rl, rl2, rl3;
#if ks == 1 || ks == 2 || ks == 3 || ks == 4
unsigned long rl4;
#endif
#if ks == 2
rl = (i << 1)*c;
#elif ks == 3
rl = ((i << 1) + 1)*c;
#else
rl = i*c;
#endif
__local double ut, pt;
if (x == 0) pt = pr[i*c4 + ks];
if (y == 0) ut = ur[j*c5 + ks];
double aa = 0.0;
double bb, cc;
double dd, ee;
for (unsigned long k=0; k<m; k++){
#if ks == 1 || ks == 4
rl3 = (k << 1) + 1; rl4 = (k << 2) + 3;
bb = ar[rl + rl3 - 1]; cc = ar[rl + rl3];
dd = br[rl2 + rl4 - 1]; ee = br[rl2 + rl4 - 3];
#elif ks == 2 || ks == 3
rl3 = (k << 2) + 3; rl4 = (k << 1) + 1;
bb = ar[rl + rl3 - 3]; cc = ar[rl + rl3 - 2];
dd = br[rl2 + rl4]; ee = br[rl2 + rl4 - 1];
#else
rl3 = (k << 1) + 1;
bb = ar[rl + rl3 - 1]; cc = ar[rl + rl3];
dd = br[rl2 + rl3]; ee = br[rl2 + rl3 - 1];
#endif
aa += (bb + dd)*(cc + ee);
}
cr[cur] = aa - pt - ut;
}
While working, I noticed that if I delete the last line, the kernel takes 6 times less to run even changing the last line with cr[cur] = 5.0 - pt - ut; for example.
Shouldn't it take the same, or something similar at least?
Even looking for an answer, taking advantage of the fact that I have CPU and GPU, I have tried in several runtime (PoCL and opencl-amd) and the same thing happens :-/
I would be very grateful if someone would give me a hand in making me understand why this happens. I don't understand :"v
All the operations inside the loop do not have side effects, you only read from those __global pointers, and you calculate some temporary values that in the end get accumulated into aa through that final aa += .... In other words, the sole purpose of that loop is to calculate the value of aa.
Therefore, if you remove aa from the last line (outside the loop), all the operations inside the loop are completely useless, and you end up with a loop that does nothing except reading some values and updating local variables that will get discarded at function return. Compiling the above code with optimizations enabled (which I assume you are doing, otherwise your question wouldn't make much sense), the compiler is very likely to just get rid of the entire loop. Hence, the code without that final aa runs a lot faster.
Here's a GCC example (adapted removing CUDA annotations), where you can see that even the lowest level of optimization (-O1) removes the entire body of the loop, leaving only comparisons and the incrementing of i. With -O2, the whole loop is removed.

Why do i get weird artefacts when changing pixels of an SDL_Surface?

So I am trying to edit every pixels of an SDL_Surface to apply the grayscaling formula. But when running my code I get an area of the width of my entire screen and the height of the surface filled with weird RGB stripes.
void grayscale32(Uint8 *pixels, SDL_PixelFormat *format, int width, int height,
int pitch) {
Uint32 *targetPixel = NULL;
Uint8 r, g, b, gray;
for (int y = 0; y * pitch < height * width * 3; y++) {
for (int x = 0; x < pitch / 4; x++) {
targetPixel = pixels + y * pitch + x * sizeof(*targetPixel);
SDL_GetRGB(*targetPixel, format, &r, &g, &b);
gray = 0.21 * r + 0.72 * g + 0.07 * b;
*targetPixel = SDL_MapRGB(format, gray, gray, gray);
}
}
}
I suppose it's a matter of Byte when converting between Uint8 and Uint32 back and forth but i don't know exactly why. I tried passing Uint8 *pixels as a Uint32 * but it didn't fix it and caused a segmentation fault.
You have vertical banding and each row lines up (vs. stairstepping). So, I believe that your use of 3 is correct (i.e. the pixels are 3 bytes each)
This can/could be caused by a few things.
You could be off by one byte in the row address.
You could be wrapping/truncating pixel color values (i.e. you need saturation math). That is, (e.g.) if you have a calculated gray value of 256, it will wrap/truncate to 0.
I think there's a simpler way to index into the pixel array.
Anyway, here's some refactored code. It's not been tested but it should give you something to look at.
I'm assuming the format is RGB (3 bytes / pixel) [vs RGBA with 4 bytes / pixel] and pitch is the number of bytes per row.
#include <SDL2/SDL.h>
typedef unsigned char Uint8;
typedef unsigned int Uint32;
static inline Uint8
grayof(Uint8 r, Uint8 g, Uint8 b)
{
Uint8 gray;
#if ORIG
gray = 0.21 * r + 0.72 * g + 0.07 * b;
#else
Uint32 acc = 0;
// use scaled integer arithmetic (vs. float)
acc += 210 * (Uint32) r;
acc += 720 * (Uint32) g;
acc += 70 * (Uint32) b;
acc /= 1000;
// saturation math
// (e.g.) prevent pixel value of 256 from wrapping to 1
if (acc > 255)
acc = 255;
gray = acc;
#endif
return gray;
}
void
grayscale32(Uint8 *pixels, SDL_PixelFormat *format,
int width, int height, int pitch)
{
Uint8 *byteptr;
Uint32 *targetPixel;
Uint8 r, g, b, gray;
for (int y = 0; y < height; ++y) {
byteptr = &pixels[y * pitch];
targetPixel = (Uint32 *) byteptr;
for (int x = 0; x < width; ++x, ++targetPixel) {
SDL_GetRGB(*targetPixel, format, &r, &g, &b);
gray = grayof(r, g, b);
*targetPixel = SDL_MapRGB(format, gray, gray, gray);
}
}
}
Without more information [or the original image], it's difficult to tell exactly what the format and geometry are.
Although less likely, here's an alternate indexing scheme:
void
grayscale32(Uint8 *pixels, SDL_PixelFormat *format,
int width, int height, int pitch)
{
Uint8 *byteptr;
Uint32 *targetPixel;
Uint8 r, g, b, gray;
for (int y = 0; y < height; ++y) {
byteptr = &pixels[y * pitch];
for (int x = 0; x < width; ++x, byteptr += 3) {
targetPixel = (Uint32 *) byteptr;
SDL_GetRGB(*targetPixel, format, &r, &g, &b);
gray = grayof(r, g, b);
*targetPixel = SDL_MapRGB(format, gray, gray, gray);
}
}
}
UPDATE:
I don't think the saturation check is really required here since the factors in the equation add up to 1, so even with an original value of rgb(255, 255, 255) we get a gray of 255.
Yes, you are correct about not needing the sat math. I was being conservative [because I was too lazy to check the factor values :-)].
The first one still gave me stripes despite using the grayof function, but the second one worked perfectly fine by itself.
Okay, this means that each pixel is 3 bytes [R/G/B]. I wasn't sure whether it was 4 bytes with a format of R/G/B/A where A is the alpha value.
But, given that the memory format is 3 byte/RGB this presents an issue with targetPixel being Uint32 *.
That's because when doing *targetPixel = ...;, it's storing 4 bytes but the loop increment is 3. This means that a given store is bleeding one byte into the next pixel area.
This would look something like:
Memory layout:
| 0 3 6 9
| R G B | R G B | R G B | R G B |
Store progression:
| 1 2 3 | 4
| 1 2 3 | 4
| 1 2 3 | 4
| 1 2 3 | 4
So, effectively the second store is not getting the original R value
It may seem/look okay, but I suspect that the resultant gray values are a bit off.
Here's a version that may fix the problem. You may need to compile with -DALT=1 if the R and B values seem to be reversed. Try it both ways: with/without.
#include <SDL2/SDL.h>
typedef unsigned char Uint8;
typedef unsigned int Uint32;
#ifndef ALT
#define ALT 0
#endif
enum {
#if ALT
OFF_R = 2,
OFF_G = 1,
OFF_B = 0,
#else
OFF_R = 0,
OFF_G = 1,
OFF_B = 2,
#endif
PIXBYTES = 3
};
static inline Uint8
grayof(Uint8 r, Uint8 g, Uint8 b)
{
Uint8 gray;
#if ORIG
gray = 0.21 * r + 0.72 * g + 0.07 * b;
#else
Uint32 acc = 0;
// use scaled integer arithmetic (vs. float)
acc += 210 * (Uint32) r;
acc += 720 * (Uint32) g;
acc += 70 * (Uint32) b;
acc /= 1000;
// saturation math
// (e.g.) prevent pixel value of 256 from wrapping to 1
#if SAT
if (acc > 255)
acc = 255;
#endif
gray = acc;
#endif
return gray;
}
void
grayscale32(Uint8 *pixels, SDL_PixelFormat *format,
int width, int height, int pitch)
{
Uint8 *byteptr;
Uint8 *bytelim;
Uint8 gray;
for (int y = 0; y < height; ++y) {
byteptr = &pixels[y * pitch];
bytelim = &byteptr[width * PIXBYTES];
for (; byteptr < bytelim; byteptr += PIXBYTES) {
gray = grayof(byteptr[OFF_R], byteptr[OFF_G], byteptr[OFF_B]);
byteptr[OFF_R] = gray;
byteptr[OFF_G] = gray;
byteptr[OFF_B] = gray;
}
}
}

How do I implement a direct form II transfer function?

I need to implement a transfer function from direct form II. Here's a description of what direct form II is. I'm allowed to choose what coefficents are to be included.
So far my code looks like this:
int main(void)
{
int l, m;
l = dirii2(2); // l = -82;
//m = directii(2);
return l;
}
int dirii2(int x)
{
int y = 0; // output // 3 is constant of array size.
static int v[3] = {6,4,0};
int b[3] = {3,5,2}, a[2] = {3,6};
int q0, q1, q2;
for (int i = 0; i < 3 ; i++) {
q0 = x; // q0 = x = 2
q1 = a[2-2]*v[2-1]; // q1 = 3 * 6 = 18
q2 = a[2-1]*v[2-2]; // q2 = 6 * 4 = 24
v[2] = q0 - q1 - q2;// v(n) = x(n) - a1*v(n-1) - a2v(n-2); // v = 2 - 18 - 24 = -40
y =+ b[i]*v[2-i];// + b[1]*v[2-1]+b[2]*v[2-2];
v[0] = v[1];
v[1] = v[2];
}
return y;
}
When I run the code the result y gains or loses values rapidly like it's getting unstable:
First iteration y becomes -138.
Second iteration y becomes 230.
Third iteration y becomes -92.
Did I write the code right?

How to calculate inverse modular exponentation in c?

I want to take modular inverse(k≥1) of integer and then multiply the result to another integer, as explain in following expression:
result=((x^(-k)))*y mod z
How can i implement this expression, where k≥1?
You need to define four function:
uint64_t modular_exponentiation(uint64_t x, uint64_t y, uint64_t z)
{
uint64_t res = 1;
x = x % z;
while (y > 0)
{
if (y & 1)
res = (res*x) % p;
y = y>>1; // y = y/2
x = (x*x) % z;
}
return res;
}
uint64_t moduloMultiplication(uint64_t a, uint64_t b,uint64_t z)
{
uint64_t res = 0;
a %= z;
while (b)
{
if (b & 1)
res = (res + a) % z;
a = (2 * a) % p;
b >>= 1; // b = b / 2
}
return res;
}
void extendedEuclid(uint64_t A, uint64_t B)
{
uint64_t temp;
if(B == 0)
{
d = A;
x = 1;
y = 0;
}
else
{
extendedEuclid(B,A%B);
temp = x;
x = y;
y = temp - (A/B)*y;
}
}
int modInverse(uint64_t A, uint64_t M)
{
extendedEuclid(A,M);
if (x < 0)
x += M;
return (x);
}
In main():
uint64_t result=0x00;
result=modular_exponentiation(x,k,z); // (x^k) mod z
result=modInverse(result,z); // ((x^k)^-1) mod z == x^(-k) mod z
result=moduloMultiplication(result,y,z);// x^(-k) * y mod z
You will need the extended greatest common divisor to compute the inverse of x for modulus z. When x and zare relatively prime you have a * x + b * z = 1 = gcd(x, z). And thus, a * x = 1 - b * z or a * x = 1 mod z, and a is the inverse to x in the modulus z.
Now you may compute result with x^-1 = a mod z:
result = power(a, k) * y % z
with ordinary integer arithmetic in C, where power() is the ordinary integer exponentiation.
Since the coefficients in such calculations can become very large very quickly, it is better to use ready-made libraries (e.g. gmp).
You can try the mod_inv C function :
// return a modular multiplicative inverse of n with respect to the modulus.
// return 0 if the linear congruence has no solutions.
unsigned mod_inv(unsigned ra, unsigned rb) {
unsigned rc, sa = 1, sb = 0, sc, i = 0;
if (rb > 1) do {
rc = ra % rb;
sc = sa - (ra / rb) * sb;
sa = sb, sb = sc;
ra = rb, rb = rc;
} while (++i, rc);
sa *= (i *= ra == 1) != 0;
sa += (i & 1) * sb;
return sa;
}
This is basically the standard algorithm, when n = 1 and mod = 0 the output is 0, not 1, i think we have not many computations to execute modulo 0.
The modular multiplicative inverse of an integer N modulo m is an integer n such as the inverse of N modulo m equals n, if a modular inverse exists then it is unique. To calculate the value of the modulo inverse, use the extended euclidean algorithm which finds solutions to the Bezout identity.
Example of usage :
#include <assert.h>
int main(void) {
unsigned n, mod, res;
n = 52, mod = 107;
res = mod_inv(n, mod);
assert(res == 35); // 35 is a solution of the linear congruence.
n = 66, mod = 123;
res = mod_inv(n, mod);
assert(res == 0); // 66 does note have an inverse modulo 123.
}
/*
n = 7 and mod = 45 then res = 13 so 1 == ( 13 * 7 ) % 45
n = 52 and mod = 107 then res = 35 so 1 == ( 35 * 52 ) % 107
n = 213 and mod = 155 then res = 147 so 1 == ( 147 * 213 ) % 155
n = 392 and mod = 45 then res = 38 so 1 == ( 38 * 392 ) % 45
n = 687 and mod = 662 then res = 53 so 1 == ( 53 * 687 ) % 662
n = 451 and mod = 799 then res = 512 so 1 == ( 512 * 451 ) % 799
n = 1630 and mod = 259 then res = 167 so 1 == ( 167 * 1630 ) % 259
n = 4277 and mod = 4722 then res = 191 so 1 == ( 191 * 4277 ) % 4722
*/
Source

openCV grayscale / color addressing pixel

I have written a block matching algorithm in c++ using opencv for my thesis .
It is working on grayscale pictures and addresses the IPLImage by his absolute pixeladress.
I have to devide the IPLImage in blocks of the same size (8x8 pxls). In order to access the pixel values within the blocks, I compute the pixeladress and access the pixel value in this way:
for (int yBlock = 0; yBlock < maxYBlocks; yBlock++){
for (int xBlock = 0; yxlock < maxXBlocks; xBlock++){
for (int yPixel = 0; yPixel < 8; yPixel++){
for (int xPixel = 0; xPixel < 8; xPixel++){
pixelAdress = yBlock*imageWidth*8 + xBlock*8 + yPixel*imageWidth + xPixel;
unsigned char* imagePointer = (unsigned char*)(img->imageData);
pixelValue = imagePointer[pixelAdress];
}
}
}
}
I do NOT really itterate over rows and cols and it works great!
Now I have a colored IPLImage (no grayscale) and don't know how to access the r, g, b pixelvalues.
I found this on this forum
for( row = 0; row < img->height; row++ ){
for ( col = 0; col < img->width; col++ ){
b = (int)img->imageData[img->widthStep * row + col * 3];
g = (int)img->imageData[img->widthStep * row + col * 3 + 1];
r = (int)img->imageData[img->widthStep * row + col * 3 + 2];
}
}
but I'm not sure how to use it on my computed pixelAdress. Is it correct just to multiply it by 3 (because I do not iterate over rows and the add 0, 1 or 2? For example:
pixelValueR = imagePointer[pixelAdress*3 + 2];
pixelValueG = imagePointer[pixelAdress*3 + 1];
pixelValueB = imagePointer[pixelAdress*3 + 0];
or do I have to use widthStep where I used imageWidth before, like this:
pixelAdressR = pixelAdress = yBlock*img->widthStep*8 + xBlock*8*3 + yPixel*img->widthStep + xPixel*3 + 2;
pixelAdressG = pixelAdress = yBlock*img->widthStep*8 + xBlock*8*3 + yPixel*img->widthStep + xPixel*3 + 1;
pixelAdressB = pixelAdress = yBlock*img->widthStep*8 + xBlock*8*3 + yPixel*img->widthStep + xPixel*3;
and so access
pixelValueR = imagePointer[pixelAdressR];
pixelValueG = imagePointer[pixelAdressG];
pixelValueB = imagePointer[pixelAdressB];
In case of a multi channel Mat (BGR in this example) you can access the single pixel by using, as described here
Vec3b intensity = img.at<Vec3b>(y, x);
uchar blue = intensity.val[0];
uchar green = intensity.val[1];
uchar red = intensity.val[2];
not sure about your whole algorithm and can't test it at the moment, but for IplImages, the memory is aligned as this:
1. row
baseadress + 0 = b of [0]
baseadress + 1 = g of [0]
baseadress + 2 = r of [0]
baseadress + 3 = b of [1]
etc
2. row
baseadress + widthStep + 0 = b
baseadress + widthStep + 1 = g
baseadress + widthStep + 2 = r
so if you have have n*m blocks of size 8x8 unsigned char bgr data and you want to loop over variables [x,y] in block [bx,by] you can do it like this:
baseadress + (by*8+ y_in_block)*widthStep + (bx*8+x)*3 +0 = b
baseadress + (by*8+ y_in_block)*widthStep + (bx*8+x)*3 +1 = g
baseadress + (by*8+ y_in_block)*widthStep + (bx*8+x)*3 +2 = r
since row by*8+y is adressbaseadress + (by*8+ y_in_block)*widthStep`
and column bx*8+x is adress offset (bx*8+x)*3
For Mat (e.g. Mat img)
Grayscale (8UC1):
uchar intensity = img.at<uchar>(y, x);
Color image (BGR color ordering, the default format returned by imread):
Vec3b intensity = img.at<Vec3b>(y, x);
uchar blue = intensity.val[0];
uchar green = intensity.val[1];
uchar red = intensity.val[2];
For IplImage (e.g. IplImage* img)
Grayscale:
uchar intensity = CV_IMAGE_ELEM(img, uchar, h, w);
Color image:
uchar blue = CV_IMAGE_ELEM(img, uchar, y, x*3);
uchar green = CV_IMAGE_ELEM(img, uchar, y, x*3+1);
uchar red = CV_IMAGE_ELEM(img, uchar, y, x*3+2);

Resources