Pixman compositing vs. alpha blending - c

Below is a small test program (works on little endian machines).
As is, the result is already strange to me :
in: r=20 g=20 b=80 a=FF (#202080FF, ok!)
out: r=90 g=90 b=C0 a=FF (#9090C0FF, strange...)
Where as I expected the fill color #FFFFFFFF x the mask 0x80 = #FFFFFF80 and so an output of #9090FFFF...
Now, if I set the fill color to #FFFFFF80 by changing "cfill.alpha = uint16_t(0x80) << 8;" , the result seems really wrong :
in: r=20 g=20 b=80 a=FF
out: r=98 g=98 b=E0 a=FF
I would expect fill x mask => #FFFFFF40 and thus an output of: #606060C0FF.
I especially do not understand how a lower alpha input color can end up in a lighter output on the target image.
What I am doing wrong here ?
Is there another PIXMAP_OP_xxx that would work as I expect ?
Thanks.
#include <stdlib.h>
#include <stdio.h>
#include "pixman.h"
union C {
uint32_t value;
struct RGBA8888 {
uint8_t a;
uint8_t b;
uint8_t g;
uint8_t r;
} rgba;
};
int main()
{
// create target image full with r=0x20 g=0x20 b=0x80 a=0xFF
size_t w = 100; // multiple of 4 for alignment
size_t h = 100;
C *target = (C*)malloc(w * h * sizeof(C));
for(size_t i = 0; i < w * h; ++i)
target[i].value = 0x202080FF;
printf("in: r=%02X g=%02X b=%02X a=%02X\n",
target[0].rgba.r, target[0].rgba.g, target[0].rgba.b, target[0].rgba.a);
// connect target to pixman image
pixman_image_t *ptarget = pixman_image_create_bits(PIXMAN_r8g8b8a8, w, h, (uint32_t*)target, w * sizeof(uint32_t));
// create fill
pixman_color_t cfill;
cfill.red = uint16_t(0xFF) << 8;
cfill.green = uint16_t(0xFF) << 8;
cfill.blue = uint16_t(0xFF) << 8;
cfill.alpha = uint16_t(0xFF) << 8;
pixman_image_t *pfill = pixman_image_create_solid_fill(&cfill);
// create mask with a=0x80
uint8_t *mask = (uint8_t*)malloc(w * h);
for(size_t i = 0; i < w * h; ++i)
mask[i] = 0x80;
pixman_image_t *pmask = pixman_image_create_bits(PIXMAN_a8, w, h, (uint32_t*)mask, w);
// do compositing
pixman_image_composite(
PIXMAN_OP_OVER,
pfill, pmask, ptarget,
// src_x, src_y
0, 0,
// mask_x, mask_y
0, 0,
// dest_x, dest_y, width, height
0, 0, w, h);
// display one pixel of target
printf("out: r=%02X g=%02X b=%02X a=%02X\n",
target[0].rgba.r, target[0].rgba.g, target[0].rgba.b, target[0].rgba.a);
}

I turns out that Pixman works with premultiplied alpha !
So the white with alpha should be #80808080 and subsequently #40404040 and not #FFFFFF80 and #FFFFFF40.
Hope it helps somebody else ;)

Related

Looking for performance improvement of NEON code to match clipping area on the screen

Here is my test code to find 1st clipping area on the screen.
Two subroutines and dummy loops in the code to compare the performance of them.
point_in_neon (NEON version) and point_in (Regular version) does the same thing:
find out the first clipping area (contains given point) in given list and return -1 if there is no matching area.
I expected NEON version is faster than regular version.
Unfortunately, it is slower than regular version. Is there another way to speed it up?
The compiler command is:
${CC} -O2 -ftree-vectorize -o vcomp vcomp.c
Thanks,
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>
#include <arm_neon.h>
#define WIDTH (4096)
#define HEIGHT (4096)
#define CLIPS (32)
static inline uint64_t now(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return tv.tv_sec*1000000+tv.tv_usec;
}
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;
uint32_t height;
} rect_t;
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
const int32_t right[4]={
rs[0].x+rs[0].width-1,
rs[1].x+rs[1].width-1,
rs[2].x+rs[2].width-1,
rs[3].x+rs[3].width-1
}, bottom[4]={
rs[0].y+rs[0].height-1,
rs[1].y+rs[1].height-1,
rs[2].y+rs[2].height-1,
rs[3].y+rs[3].height-1
};
int32x4_t p, r;
uint32x4_t t;
uint32_t res[4];
//p = <Xp, Xp, Xp, Xp>
p=vld1q_dup_s32(&pt->x);
//r = <Left0, Left1, Left2, Left3>
r=vld1q_lane_s32(&rs[0].x, r, 0);
r=vld1q_lane_s32(&rs[1].x, r, 1);
r=vld1q_lane_s32(&rs[2].x, r, 2);
r=vld1q_lane_s32(&rs[3].x, r, 3);
//t = (p >= r)
t=vcgeq_s32(p, r);
//r = <Right0, Right1, Right2, Right3>
r=vld1q_s32(&right);
//t = t & (r >= p)
t=vandq_u32(t, vcgeq_s32(r, p));
//p = <Yp, Yp, Yp, Yp>
p=vld1q_dup_s32(&pt->y);
//r = <Top0, Top1, Top2, Top3>
r=vld1q_lane_s32(&rs[0].y, r, 0);
r=vld1q_lane_s32(&rs[1].y, r, 1);
r=vld1q_lane_s32(&rs[2].y, r, 2);
r=vld1q_lane_s32(&rs[3].y, r, 3);
//t = t & (p >= r)
t=vandq_u32(t, vcgeq_s32(p, r));
//r = <Bottom0, Bottom1, Bottom2, Bottom3>
r=vld1q_s32(&bottom);
//t = t & (r >= p)
t=vandq_u32(t, vcgeq_s32(r, p));
vst1q_u32(res, t);
if(res[0])
return 0;
else if(res[1])
return 1;
else if(res[2])
return 2;
else if(res[3])
return 3;
return -1;
}
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
int32_t i;
for(i=0;i<len;i++) {
int32_t right=rs[i].x+rs[i].width-1,
bottom=rs[i].y+rs[i].height-1;
if(pt->x>=rs[i].x && pt->x<=right &&
pt->y>=rs[i].y && pt->y<=bottom)
return i;
}
return -1;
}
int32_t main(int32_t argc, char *argv[]) {
rect_t rs[CLIPS];
int32_t i, j;
uint64_t ts0, ts1;
int32_t res[2][CLIPS];
srand((unsigned int)time(NULL));
for(i=0;i<CLIPS;i++) {
rs[i].x=rand()%WIDTH;
rs[i].y=rand()%HEIGHT;
rs[i].width=rand()%WIDTH;
rs[i].height=rand()%HEIGHT;
}
memset(res, 0, sizeof(res));
ts0=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
if(idx>=0)
res[0][idx]=1;
}
}
ts0=now()-ts0;
ts1=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
int32_t k, idx;
point_t p={i, j};
for(k=0, idx=-1;k<CLIPS/4;k++) {
idx=point_in_neon(&p, &rs[k*4]);
if(idx>=0)
break;
}
if(idx>=0)
res[1][k*4+idx]=1;
}
}
ts1=now()-ts1;
/*
for(i=0;i<CLIPS;i++) {
if(res[0][i]!=res[1][i]) {
printf("error.\n");
return 1;
}
}
*/
printf("regular = %lu\n", ts0);
printf("neon = %lu\n", ts1);
return 0;
}
According to Peter Cordes's suggestion, I replaced data loding parts of point_in_neon subroutine with vld4q_s32 intrinsic and subsequent right and bottom calculation can be vectorized. Now the code is shorter and faster than regular version.
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
int32x4x4_t r;
int32x4_t right, bottom, p;
uint32x4_t t;
uint32_t res[4];
/*
r.val[0] = <X0, X1, X2, X3>
r.val[1] = <Y0, Y1, Y2, Y3>
r.val[2] = <Width0, Width1, Width2, Width3>
r.val[3] = <Height0, Height1, Height2, Height3>
*/
r=vld4q_s32(rs);
//right = <Right0, Right1, Right2, Right3>
right=vsubq_s32(vaddq_s32(r.val[0], r.val[2]), vdupq_n_s32(1));
//bottom = <Bottom0, Bottom1, Bottom2, Bottom3>
bottom=vsubq_s32(vaddq_s32(r.val[1], r.val[3]), vdupq_n_s32(1));
//p = <Xp, Xp, Xp, Xp>
p=vld1q_dup_s32(&pt->x);
//t = (p >= left)
t=vcgeq_s32(p, r.val[0]);
//t = t & (right >= p)
t=vandq_u32(t, vcgeq_s32(right, p));
//p = <Yp, Yp, Yp, Yp>
p=vld1q_dup_s32(&pt->y);
//t = t & (p >= top)
t=vandq_u32(t, vcgeq_s32(p, r.val[1]));
//t = t & (r >= bottom)
t=vandq_u32(t, vcgeq_s32(bottom, p));
vst1q_u32(res, t);
if(res[0])
return 0;
else if(res[1])
return 1;
else if(res[2])
return 2;
else if(res[3])
return 3;
return -1;
}
Starting with your original point_in method, we can clean up a little bit here by removing the -1's, and changing <= to <.
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
int32_t i;
for(i=0; i < len; i++)
{
// this is pointless - change your data structures so that
// the rect stores minx/maxx, miny/maxy instead!
int32_t right = rs[i].x + rs[i].width;
int32_t bottom= rs[i].y + rs[i].height;
bool cmp0 = pt->x >= rs[i].x;
bool cmp1 = pt->y >= rs[i].y;
bool cmp2 = pt->x < right;
bool cmp3 = pt->y < bottom;
if(cmp0 & cmp1 & cmp2 & cmp3)
return i;
}
return -1;
}
Next obvious thing to point out:
// your screen size...
#define WIDTH (4096)
#define HEIGHT (4096)
// yet your structures use uint32 as storage???
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;
uint32_t height;
} rect_t;
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
If you can get away with using 16bit integers, this will go at twice the speed (because you can fit 8x 16bit numbers in a SIMD register, v.s. 4x 32bit). Whilst we're at it, we might as well change the data layout to structure of array at the same time.
I'm also going to hoist the pointless p.x + width out, and store it as xmax/ymax instead (removes duplicated computation in your loops).
typedef struct rect_x8_t {
int16x8_t x;
int16x8_t y;
int16x8_t xmax; //< x + width
int16x8_t ymax; //< y + height
} rect_x8_t;
typedef struct point_x8_t {
int16x8_t x;
int16x8_t y;
} point_x8_t;
On the assumption you don't have a number of clips that's divisible by 8, we'll need to pad the number slightly (not a big deal)
// assuming this has already been initialised
rect_t rs[CLIPS];
// how many batches of 8 do we need?
uint32_t CLIPS8 = (CLIPS / 8) + (CLIPS & 7 ? 1 : 0);
// allocate in batches of 8
rect_x8_t rs8[CLIPS8] = {};
// I'm going to do this rubbishly as an pre-process step.
// I don't care too much about efficiency here...
for(uint32_t i = 0; i < CLIPS; ++i) {
rs8[i / 8].x[i & 7] = rs[i].x;
rs8[i / 8].y[i & 7] = rs[I].y;
rs8[i / 8].xmax[i & 7] = rs[i].x + rs[i].width;
rs8[i / 8].ymax[i & 7] = rs[i].y + rs[i].height;
}
I have a couple of concerns here:
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
// This seems wrong? Shouldn't it be p = {j, i} ?
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
// I'm not quite sure what the result says about your
// image data and clip regions???
//
// This seems like a really silly way of asking
// a simple question about the clip regions. The pixels
// don't have any effect here.
if(idx >= 0)
res[0][idx] = 1;
}
}
Anyhow, now refactoring the point_in method to use int16x8_t, we get:
inline int32_t point_in_x8(const point_x8_t pt,
const rect_x8_t* rs,
uint32_t len) {
for(int32_t i = 0; i < len; i++) {
// perform comparisons on 8 rects at a time
uint16x8_t cmp0 = vcgeq_s16(pt.x, rs[i].x);
uint16x8_t cmp1 = vcgeq_s16(pt.y, rs[i].y);
uint16x8_t cmp2 = vcltq_s16(pt.x, rs[i].xmax);
uint16x8_t cmp3 = vcltq_s16(pt.y, rs[I].ymax);
// combine to single comparison value
uint16x8_t cmp01 = vandq_u16(cmp0, cmp1);
uint16x8_t cmp23 = vandq_u16(cmp2, cmp3);
uint16x8_t cmp0123 = vandq_u16(cmp01, cmp23);
// use a horizontal max to see if any lanes are true
if(vmaxvq_u16(cmp0123)) {
for(int32_t j = 0; j < 8; ++j) {
if(cmp0123[j])
return 8*i + j;
}
}
}
return -1;
}
Any additional padded elements in the rect_x8_t structs should end up being ignored (since they should be 0/0, 0/0, which will always end up being false).
Then finally...
for(i = 0; i < HEIGHT; i++) {
point_x8_t p;
// splat the y value
p.y = vld1q_dup_s16(i);
for(j = 0; j < WIDTH; j++) {
// splat the x value
p.x = vld1q_dup_s16(j);
int32_t idx = point_in_x8(p, rs8, CLIPS8);
if(idx >= 0)
res[1][idx] = 1;
}
}
The vld4 instruction actually has a fairly high latency. Given that WIDTH * HEIGHT is actually a very big number, pre-swizzling here (as a pre-processing step) makes a lot more sense imho.
HOWEVER
This whole algorithm could be massively improved by simply ignoring the pixels, and working on CLIP regions directly.
A clip region will be false if it is entirely contained by the preceding clip regions
for(i = 0; i < CLIPS; i++) {
// if region is empty, ignore.
if(rs[i].width == 0 || rs[i].height == 0) {
res[0][i] = 0;
continue;
}
// first region will always be true (unless it's of zero size)
if(i == 0) {
res[0][1] = 1;
continue;
}
uint32_t how_many_intersect = 0;
bool entirely_contained = false;
uint32_t intersection_indices[CLIPS] = {};
// do a lazy test first.
for(j = i - 1; j >= 0; --j) {
// if the last region is entirely contained by preceding
// ones, it will be false. exit loop.
if(region_is_entirely_contained(rs[i], rs[j])) {
res[0][i] = 0;
entirely_contained = true;
j = -1; ///< break out of loop
}
else
// do the regions intersect?
if(region_intersects(rs[i], rs[j])) {
intersection_indices[how_many_intersect] = j;
++how_many_intersect;
}
}
// if one region entirely contains this clip region, skip it.
if(entirely_contained) {
continue;
}
// if you only intersect one or no regions, the result is true.
if(how_many_intersect <= 1) {
res[0][i] = 1;
continue;
}
// If you get here, the result is *probably* true, however
// you will need to split this clip region against the previous
// ones to be fully sure. If all regions are fully contained,
// the answer is false.
// I won't implement it, but something like this:
* split rs[i] against each rs[intersection_indices[]].
* Throw away the rectangles that are entirely contained.
* Each bit that remains should be tested against each rs[intersection_indices[]]
* If you find any split rectangle that isn't contained,
set to true and move on.
}

Why do i get weird artefacts when changing pixels of an SDL_Surface?

So I am trying to edit every pixels of an SDL_Surface to apply the grayscaling formula. But when running my code I get an area of the width of my entire screen and the height of the surface filled with weird RGB stripes.
void grayscale32(Uint8 *pixels, SDL_PixelFormat *format, int width, int height,
int pitch) {
Uint32 *targetPixel = NULL;
Uint8 r, g, b, gray;
for (int y = 0; y * pitch < height * width * 3; y++) {
for (int x = 0; x < pitch / 4; x++) {
targetPixel = pixels + y * pitch + x * sizeof(*targetPixel);
SDL_GetRGB(*targetPixel, format, &r, &g, &b);
gray = 0.21 * r + 0.72 * g + 0.07 * b;
*targetPixel = SDL_MapRGB(format, gray, gray, gray);
}
}
}
I suppose it's a matter of Byte when converting between Uint8 and Uint32 back and forth but i don't know exactly why. I tried passing Uint8 *pixels as a Uint32 * but it didn't fix it and caused a segmentation fault.
You have vertical banding and each row lines up (vs. stairstepping). So, I believe that your use of 3 is correct (i.e. the pixels are 3 bytes each)
This can/could be caused by a few things.
You could be off by one byte in the row address.
You could be wrapping/truncating pixel color values (i.e. you need saturation math). That is, (e.g.) if you have a calculated gray value of 256, it will wrap/truncate to 0.
I think there's a simpler way to index into the pixel array.
Anyway, here's some refactored code. It's not been tested but it should give you something to look at.
I'm assuming the format is RGB (3 bytes / pixel) [vs RGBA with 4 bytes / pixel] and pitch is the number of bytes per row.
#include <SDL2/SDL.h>
typedef unsigned char Uint8;
typedef unsigned int Uint32;
static inline Uint8
grayof(Uint8 r, Uint8 g, Uint8 b)
{
Uint8 gray;
#if ORIG
gray = 0.21 * r + 0.72 * g + 0.07 * b;
#else
Uint32 acc = 0;
// use scaled integer arithmetic (vs. float)
acc += 210 * (Uint32) r;
acc += 720 * (Uint32) g;
acc += 70 * (Uint32) b;
acc /= 1000;
// saturation math
// (e.g.) prevent pixel value of 256 from wrapping to 1
if (acc > 255)
acc = 255;
gray = acc;
#endif
return gray;
}
void
grayscale32(Uint8 *pixels, SDL_PixelFormat *format,
int width, int height, int pitch)
{
Uint8 *byteptr;
Uint32 *targetPixel;
Uint8 r, g, b, gray;
for (int y = 0; y < height; ++y) {
byteptr = &pixels[y * pitch];
targetPixel = (Uint32 *) byteptr;
for (int x = 0; x < width; ++x, ++targetPixel) {
SDL_GetRGB(*targetPixel, format, &r, &g, &b);
gray = grayof(r, g, b);
*targetPixel = SDL_MapRGB(format, gray, gray, gray);
}
}
}
Without more information [or the original image], it's difficult to tell exactly what the format and geometry are.
Although less likely, here's an alternate indexing scheme:
void
grayscale32(Uint8 *pixels, SDL_PixelFormat *format,
int width, int height, int pitch)
{
Uint8 *byteptr;
Uint32 *targetPixel;
Uint8 r, g, b, gray;
for (int y = 0; y < height; ++y) {
byteptr = &pixels[y * pitch];
for (int x = 0; x < width; ++x, byteptr += 3) {
targetPixel = (Uint32 *) byteptr;
SDL_GetRGB(*targetPixel, format, &r, &g, &b);
gray = grayof(r, g, b);
*targetPixel = SDL_MapRGB(format, gray, gray, gray);
}
}
}
UPDATE:
I don't think the saturation check is really required here since the factors in the equation add up to 1, so even with an original value of rgb(255, 255, 255) we get a gray of 255.
Yes, you are correct about not needing the sat math. I was being conservative [because I was too lazy to check the factor values :-)].
The first one still gave me stripes despite using the grayof function, but the second one worked perfectly fine by itself.
Okay, this means that each pixel is 3 bytes [R/G/B]. I wasn't sure whether it was 4 bytes with a format of R/G/B/A where A is the alpha value.
But, given that the memory format is 3 byte/RGB this presents an issue with targetPixel being Uint32 *.
That's because when doing *targetPixel = ...;, it's storing 4 bytes but the loop increment is 3. This means that a given store is bleeding one byte into the next pixel area.
This would look something like:
Memory layout:
| 0 3 6 9
| R G B | R G B | R G B | R G B |
Store progression:
| 1 2 3 | 4
| 1 2 3 | 4
| 1 2 3 | 4
| 1 2 3 | 4
So, effectively the second store is not getting the original R value
It may seem/look okay, but I suspect that the resultant gray values are a bit off.
Here's a version that may fix the problem. You may need to compile with -DALT=1 if the R and B values seem to be reversed. Try it both ways: with/without.
#include <SDL2/SDL.h>
typedef unsigned char Uint8;
typedef unsigned int Uint32;
#ifndef ALT
#define ALT 0
#endif
enum {
#if ALT
OFF_R = 2,
OFF_G = 1,
OFF_B = 0,
#else
OFF_R = 0,
OFF_G = 1,
OFF_B = 2,
#endif
PIXBYTES = 3
};
static inline Uint8
grayof(Uint8 r, Uint8 g, Uint8 b)
{
Uint8 gray;
#if ORIG
gray = 0.21 * r + 0.72 * g + 0.07 * b;
#else
Uint32 acc = 0;
// use scaled integer arithmetic (vs. float)
acc += 210 * (Uint32) r;
acc += 720 * (Uint32) g;
acc += 70 * (Uint32) b;
acc /= 1000;
// saturation math
// (e.g.) prevent pixel value of 256 from wrapping to 1
#if SAT
if (acc > 255)
acc = 255;
#endif
gray = acc;
#endif
return gray;
}
void
grayscale32(Uint8 *pixels, SDL_PixelFormat *format,
int width, int height, int pitch)
{
Uint8 *byteptr;
Uint8 *bytelim;
Uint8 gray;
for (int y = 0; y < height; ++y) {
byteptr = &pixels[y * pitch];
bytelim = &byteptr[width * PIXBYTES];
for (; byteptr < bytelim; byteptr += PIXBYTES) {
gray = grayof(byteptr[OFF_R], byteptr[OFF_G], byteptr[OFF_B]);
byteptr[OFF_R] = gray;
byteptr[OFF_G] = gray;
byteptr[OFF_B] = gray;
}
}
}

how to create bitmap in C and compile with gcc

i decided to learn C, and i try to follow this tutorial http://ricardolovelace.com/creating-bitmap-images-with-c-on-windows.html
but when i try to compile my code with gcc as this >gcc -Wall testc o app
he doesn't know type_rgb, can i define this type and how? and where in my code ?
#include <stdio.h>
struct rgb_data {
float r, g, b;
};
void save_bitmap( const char *file_name, int width, int height, int dpi, type_rgb *pixel_data);
/*
next steps of the tutorial
*/
rgb_data *pixels = new rgb_data[width * height];
for( int x = 0; x < width; x++)
{
for(int y = 0; y < height; y++)
int a = y * width +x;
{
if ((x > 50 && x < 350) && (y > y && y < 350))
{
pixels[a].r = 255;
pixels[a].g = 255;
pixels[a].b = 0;
}else{
pixels[a].r = 55;
pixels[a].g = 55;
pixels[a].b = 55;
}
}
}
save_bitmap("black_border.bmp", width, height, dpi, pixels);
Bitmap file format is rather complicated. This is not the best way to learn C. It's better to start with something much simpler.
Having said that, the bitmap format starts with a bitmap header BITMAPFILEHEADER structure which is 14 bytes long, followed by BITMAPINFOHEADER structure 40 bytes long. These structures are defined in "Windows.h"
You have to write in various information in these structures and write them to file before writing the actual pixels.
You can have 1, 4, 8, 16, 24, and 32-bit bitmap. This is an example to read a 32-bit bitmap. This code assumes sizeof(short) is 2, sizeof(int) is 4.
int main()
{
int row, column;
int width = 100;
int height = 100;
int size = width * height * 4; //for 32-bit bitmap only
char header[54] = { 0 };
strcpy(header, "BM");
memset(&header[2], (int)(54 + size), 1);
memset(&header[10], (int)54, 1);//always 54
memset(&header[14], (int)40, 1);//always 40
memset(&header[18], (int)width, 1);
memset(&header[22], (int)height, 1);
memset(&header[26], (short)1, 1);
memset(&header[28], (short)32, 1);//32bit
memset(&header[34], (int)size, 1);//pixel size
unsigned char *pixels = malloc(size);
for(row = height - 1; row >= 0; row--) {
for(column = 0; column < width; column++) {
int p = (row * width + column) * 4;
pixels[p + 0] = 64; //blue
pixels[p + 1] = 128;//green
pixels[p + 2] = 192;//red
}
}
FILE *fout = fopen("32bit.bmp", "wb");
fwrite(header, 1, 54, fout);
fwrite(pixels, 1, size, fout);
free(pixels);
fclose(fout);
return 0;
}
Note the first pixel is blue, followed by green and read. The last pixel is not used in 32-bit bitmap. Also the height goes from bottom to top. This is another odd feature of bitmap. 24-bit bitmaps are more complicated because they need padding. 8-bit and lower will need an additional palette.
struct rgb_data {
float r, g, b;
};
float is not the right type for pixels. Each color goes from 0 to 255. This fits in unsigned char. You need instead
struct rgb_data {
unsigned r, g, b, alpha;
};
The alpha is the extra byte for 32-bit bitmap (which we won't use). Notice the size of this structure is 4. You can allocate this as
struct rgb_data *rgb = malloc(size);
Now you can access the pixels as follows:
int p = (row * width + column);
rgb[p].r = 255;
rgb[p].g = 0;
rgb[p].b = 0;
...
fwrite(rgb, 4, width * height, fout);

Issue displaying IDirect3DTexture8 after backporting from IDirect3DTexture9

I'm trying to backport someones Direct3d9 port of Quake 1 by ID software to Direct3d8 so I can port it to the original Xbox (only uses the D3D8 API).
After making the changes to use Direct3d8 it displays some mashed up pixels on the screen that appear to be in little squares :/ (see pictures).
Does anyone know whats gone wrong here? It works flawlessly with D3D9, is there some extra arguments required that I'm missing require for D3D8, rect pitch maybe?
The data been passed in is a Quake 1 .lmp 2d image file. "It consists of two integers (width and height) followed by a string of width x height bytes, each of which is an index into the Quake palette"
Its been passed to the D3D_ResampleTexture() function.
Any help would be much appreciated.
Image output using D3D8
Image output using D3D9
The code:
void D3D_ResampleTexture (image_t *src, image_t *dst)
{
int y, x , srcpos, srcbase, dstpos;
unsigned int *dstdata, *srcdata;
// take an unsigned pointer to the dest data that we'll actually fill
dstdata = (unsigned int *) dst->data;
// easier access to src data for 32 bit resampling
srcdata = (unsigned int *) src->data;
// nearest neighbour for now
for (y = 0, dstpos = 0; y < dst->height; y++)
{
srcbase = (y * src->height / dst->height) * src->width;
for (x = 0; x < dst->width; x++, dstpos++)
{
srcpos = srcbase + (x * src->width / dst->width);
if (src->flags & IMAGE_32BIT)
dstdata[dstpos] = srcdata[srcpos];
else if (src->palette)
dstdata[dstpos] = src->palette[src->data[srcpos]];
else Sys_Error ("D3D_ResampleTexture: !(flags & IMAGE_32BIT) without palette set");
}
}
}
void D3D_LoadTextureStage3 (LPDIRECT3DTEXTURE8/*9*/ *tex, image_t *image)
{
int i;
image_t scaled;
D3DLOCKED_RECT LockRect;
memset (&LockRect, 0, sizeof(D3DLOCKED_RECT));
// check scaling here first
for (scaled.width = 1; scaled.width < image->width; scaled.width *= 2);
for (scaled.height = 1; scaled.height < image->height; scaled.height *= 2);
// clamp to max texture size
if (scaled.width > /*d3d_DeviceCaps.MaxTextureWidth*/640) scaled.width = /*d3d_DeviceCaps.MaxTextureWidth*/640;
if (scaled.height > /*d3d_DeviceCaps.MaxTextureHeight*/480) scaled.height = /*d3d_DeviceCaps.MaxTextureHeight*/480;
IDirect3DDevice8/*9*/_CreateTexture(d3d_Device, scaled.width, scaled.height,
(image->flags & IMAGE_MIPMAP) ? 0 : 1,
/*(image->flags & IMAGE_MIPMAP) ? D3DUSAGE_AUTOGENMIPMAP :*/ 0,
(image->flags & IMAGE_ALPHA) ? D3DFMT_A8R8G8B8 : D3DFMT_X8R8G8B8,
D3DPOOL_MANAGED,
tex
);
// lock the texture rectangle
//(*tex)->LockRect (0, &LockRect, NULL, 0);
IDirect3DTexture8/*9*/_LockRect(*tex, 0, &LockRect, NULL, 0);
// fill it in - how we do it depends on the scaling
if (scaled.width == image->width && scaled.height == image->height)
{
// no scaling
for (i = 0; i < (scaled.width * scaled.height); i++)
{
unsigned int p;
// retrieve the correct texel - this will either be direct or a palette lookup
if (image->flags & IMAGE_32BIT)
p = ((unsigned *) image->data)[i];
else if (image->palette)
p = image->palette[image->data[i]];
else Sys_Error ("D3D_LoadTexture: !(flags & IMAGE_32BIT) without palette set");
// store it back
((unsigned *) LockRect.pBits)[i] = p;
}
}
else
{
// save out lockbits in scaled data pointer
scaled.data = (byte *) LockRect.pBits;
// resample data into the texture
D3D_ResampleTexture (image, &scaled);
}
// unlock it
//(*tex)->UnlockRect (0);
IDirect3DTexture8/*9*/_UnlockRect(*tex, 0);
// tell Direct 3D that we're going to be needing to use this managed resource shortly
//FIXME
//(*tex)->PreLoad ();
}
LPDIRECT3DTEXTURE8/*9*/ D3D_LoadTextureStage2 (image_t *image)
{
d3d_texture_t *tex;
// look for a match
// create a new one
tex = (d3d_texture_t *) malloc (sizeof (d3d_texture_t));
// link it in
tex->next = d3d_Textures;
d3d_Textures = tex;
// fill in the struct
tex->LastUsage = 0;
tex->d3d_Texture = NULL;
// copy the image
memcpy (&tex->TexImage, image, sizeof (image_t));
// upload through direct 3d
D3D_LoadTextureStage3 (&tex->d3d_Texture, image);
// return the texture we got
return tex->d3d_Texture;
}
LPDIRECT3DTEXTURE8/*9*/ D3D_LoadTexture (char *identifier, int width, int height, byte *data, /*bool*/qboolean mipmap, /*bool*/qboolean alpha)
{
image_t image;
image.data = data;
image.flags = 0;
image.height = height;
image.width = width;
image.palette = d_8to24table;
strcpy (image.identifier, identifier);
if (mipmap) image.flags |= IMAGE_MIPMAP;
if (alpha) image.flags |= IMAGE_ALPHA;
return D3D_LoadTextureStage2 (&image);
}
When you lock the texture, you have to observe the returned Pitch member of the D3DLOCKED_RECT structure. Your code is assuming that all the data is contiguous, but the Pitch can be larger than the width of a scanline in order to allow for locking a subregion and other layouts of the buffer that don't have contiguous pixels at the end of one scanline to the beginning of the next.
Look at Chapter 4 of my book "The Direct3D Graphics Pipeline" to see an example of accessing a surface and using the Pitch properly.
For anyone else that comes across this issue, it was due to the way the image was been loaded into the Xbox's memory, it needed to be swizzled.

What is the simplest RGB image format?

I am working in C on a physics experiment, Young's interference experiment and I made a program who prints to file a huge bunch of pixels:
for (i=0; i < width*width; i++)
{
fwrite(hue(raster_matrix[i]), 1, 3, file);
}
Where hue, when given a value [0..255], gives back a char * with 3 bytes, R,G,B.
I would like to put a minimal header in my image file in order to make this raw file a valid image file.
More concise, switching from:
offset
0000 : height * width : data } my data, 24bit RGB pixels
to:
offset
0000 : dword : magic \
: /* ?? */ \
0012 : dword : height } Header <--> common image file
0016 : dword : width /
: /* ?? */ /
0040 : height * width : data } my data, 24bit RGB pixels
You probably want to use the PPM format which is what you're looking for: a minimal header followed by raw RGB.
TARGA (file name extension .tga) may be the simplest widely supported binary image file format if you don't use compression and don't use any of its extensions. It's even simpler than Windows .bmp files and is supported by ImageMagick and many paint programs. It has been my go-to format when I just need to output some pixels from a throwaway program.
Here's a minimal C program to generate an image to standard output:
#include <stdio.h>
#include <string.h>
enum { width = 550, height = 400 };
int main(void) {
static unsigned char pixels[width * height * 3];
static unsigned char tga[18];
unsigned char *p;
size_t x, y;
p = pixels;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) {
*p++ = 255 * ((float)y / height);
*p++ = 255 * ((float)x / width);
*p++ = 255 * ((float)y / height);
}
}
tga[2] = 2;
tga[12] = 255 & width;
tga[13] = 255 & (width >> 8);
tga[14] = 255 & height;
tga[15] = 255 & (height >> 8);
tga[16] = 24;
tga[17] = 32;
return !((1 == fwrite(tga, sizeof(tga), 1, stdout)) &&
(1 == fwrite(pixels, sizeof(pixels), 1, stdout)));
}
The recently created farbfeld format is quite minimal, though there is not much software supporting it (at least so far).
Bytes │ Description
8 │ "farbfeld" magic value
4 │ 32-Bit BE unsigned integer (width)
4 │ 32-Bit BE unsigned integer (height)
(2+2+2+2)*width*height │ 4*16-Bit BE unsigned integers [RGBA] / pixel, row-major
Here's a minimal example that writes your image file with a minimal PPM header. Happily, I was able to get it to work with the exact for loop you've provided:
#include <math.h> // compile with gcc young.c -lm
#include <stdio.h>
#include <stdlib.h>
#define width 256
int main(){
int x, y, i; unsigned char raster_matrix[width*width], h[256][3];
#define WAVE(x,y) sin(sqrt( (x)*(x)+(y)*(y) ) * 30.0 / width)
#define hue(i) h[i]
/* Setup nice hue palette */
for (i = 0; i <= 85; i++){
h[i][0] = h[i+85][1] = h[i+170][2] = (i <= 42)? 255: 40+(85-i)*5;
h[i][1] = h[i+85][2] = h[i+170][0] = (i <= 42)? 40+i*5: 255;
h[i][2] = h[i+85][0] = h[i+170][1] = 40;
}
/* Setup Young's Interference image */
for (i = y = 0; y < width; y++) for (x = 0; x < width; x++)
raster_matrix[i++] = 128 + 64*(WAVE(x,y) + WAVE(x,width-y));
/* Open PPM File */
FILE *file = fopen("young.ppm", "wb"); if (!file) return -1;
/* Write PPM Header */
fprintf(file, "P6 %d %d %d\n", width, width, 255); /* width, height, maxval */
/* Write Image Data */
for (i=0; i < width*width; i++)
fwrite(hue(raster_matrix[i]), 1, 3, file);
/* Close PPM File */
fclose(file);
/* All done */
return 0;
}
The header code is based on the specs at http://netpbm.sourceforge.net/doc/ppm.html. For this image, the header is just a string of fifteen bytes: "P6 256 256 255\n".

Resources