To output a warning tone when detecting objects using darknet - c

I am working on a task to detect objects using darknet.
Regardless of the object names, if the threshold exceeds a certain level, I would like to have a warning sound output.
I don't know where to put the alarm output code.
The code below is the image.c currently in use.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "image.h"
#include "utils.h"
#include "blas.h"
#include "dark_cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <Windows.h>
#include <mmsystem.h>
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <math.h>
#pragma comment(lib, "winmm.lib")
#define SOUND_NAME "alarm2.wav"
#ifndef STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#endif
#ifndef STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"
#endif
extern int check_mistakes;
//int windows = 0;
float colors[6][3] = { {1,0,1}, {0,0,1},{0,1,1},{0,1,0},{1,1,0},{1,0,0} };
float get_color(int c, int x, int max)
{
float ratio = ((float)x/max)*5;
int i = floor(ratio);
int j = ceil(ratio);
ratio -= i;
float r = (1-ratio) * colors[i][c] + ratio*colors[j][c];
//printf("%f\n", r);
return r;
}
static float get_pixel(image m, int x, int y, int c)
{
assert(x < m.w && y < m.h && c < m.c);
return m.data[c*m.h*m.w + y*m.w + x];
}
static float get_pixel_extend(image m, int x, int y, int c)
{
if (x < 0 || x >= m.w || y < 0 || y >= m.h) return 0;
/*
if(x < 0) x = 0;
if(x >= m.w) x = m.w-1;
if(y < 0) y = 0;
if(y >= m.h) y = m.h-1;
*/
if (c < 0 || c >= m.c) return 0;
return get_pixel(m, x, y, c);
}
static void set_pixel(image m, int x, int y, int c, float val)
{
if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return;
assert(x < m.w && y < m.h && c < m.c);
m.data[c*m.h*m.w + y*m.w + x] = val;
}
static void add_pixel(image m, int x, int y, int c, float val)
{
assert(x < m.w && y < m.h && c < m.c);
m.data[c*m.h*m.w + y*m.w + x] += val;
}
void composite_image(image source, image dest, int dx, int dy)
{
int x,y,k;
for(k = 0; k < source.c; ++k){
for(y = 0; y < source.h; ++y){
for(x = 0; x < source.w; ++x){
float val = get_pixel(source, x, y, k);
float val2 = get_pixel_extend(dest, dx+x, dy+y, k);
set_pixel(dest, dx+x, dy+y, k, val * val2);
}
}
}
}
image border_image(image a, int border)
{
image b = make_image(a.w + 2*border, a.h + 2*border, a.c);
int x,y,k;
for(k = 0; k < b.c; ++k){
for(y = 0; y < b.h; ++y){
for(x = 0; x < b.w; ++x){
float val = get_pixel_extend(a, x - border, y - border, k);
if(x - border < 0 || x - border >= a.w || y - border < 0 || y - border >= a.h) val = 1;
set_pixel(b, x, y, k, val);
}
}
}
return b;
}
image tile_images(image a, image b, int dx)
{
if(a.w == 0) return copy_image(b);
image c = make_image(a.w + b.w + dx, (a.h > b.h) ? a.h : b.h, (a.c > b.c) ? a.c : b.c);
fill_cpu(c.w*c.h*c.c, 1, c.data, 1);
embed_image(a, c, 0, 0);
composite_image(b, c, a.w + dx, 0);
return c;
}
image get_label(image **characters, char *string, int size)
{
if(size > 7) size = 7;
image label = make_empty_image(0,0,0);
while(*string){
image l = characters[size][(int)*string];
image n = tile_images(label, l, -size - 1 + (size+1)/2);
free_image(label);
label = n;
++string;
}
image b = border_image(label, label.h*.25);
free_image(label);
return b;
}
image get_label_v3(image **characters, char *string, int size)
{
size = size / 10;
if (size > 7) size = 7;
image label = make_empty_image(0, 0, 0);
while (*string) {
image l = characters[size][(int)*string];
image n = tile_images(label, l, -size - 1 + (size + 1) / 2);
free_image(label);
label = n;
++string;
}
image b = border_image(label, label.h*.05);
free_image(label);
return b;
}
void draw_label(image a, int r, int c, image label, const float *rgb)
{
int w = label.w;
int h = label.h;
if (r - h >= 0) r = r - h;
int i, j, k;
for(j = 0; j < h && j + r < a.h; ++j){
for(i = 0; i < w && i + c < a.w; ++i){
for(k = 0; k < label.c; ++k){
float val = get_pixel(label, i, j, k);
set_pixel(a, i+c, j+r, k, rgb[k] * val);
}
}
}
}
void draw_weighted_label(image a, int r, int c, image label, const float *rgb, const float alpha)
{
int w = label.w;
int h = label.h;
if (r - h >= 0) r = r - h;
int i, j, k;
for (j = 0; j < h && j + r < a.h; ++j) {
for (i = 0; i < w && i + c < a.w; ++i) {
for (k = 0; k < label.c; ++k) {
float val1 = get_pixel(label, i, j, k);
float val2 = get_pixel(a, i + c, j + r, k);
float val_dst = val1 * rgb[k] * alpha + val2 * (1 - alpha);
set_pixel(a, i + c, j + r, k, val_dst);
}
}
}
}
void draw_box_bw(image a, int x1, int y1, int x2, int y2, float brightness)
{
//normalize_image(a);
int i;
if (x1 < 0) x1 = 0;
if (x1 >= a.w) x1 = a.w - 1;
if (x2 < 0) x2 = 0;
if (x2 >= a.w) x2 = a.w - 1;
if (y1 < 0) y1 = 0;
if (y1 >= a.h) y1 = a.h - 1;
if (y2 < 0) y2 = 0;
if (y2 >= a.h) y2 = a.h - 1;
for (i = x1; i <= x2; ++i) {
a.data[i + y1*a.w + 0 * a.w*a.h] = brightness;
a.data[i + y2*a.w + 0 * a.w*a.h] = brightness;
}
for (i = y1; i <= y2; ++i) {
a.data[x1 + i*a.w + 0 * a.w*a.h] = brightness;
a.data[x2 + i*a.w + 0 * a.w*a.h] = brightness;
}
}
void draw_box_width_bw(image a, int x1, int y1, int x2, int y2, int w, float brightness)
{
int i;
for (i = 0; i < w; ++i) {
float alternate_color = (w % 2) ? (brightness) : (1.0 - brightness);
draw_box_bw(a, x1 + i, y1 + i, x2 - i, y2 - i, alternate_color);
}
}
void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b)
{
//normalize_image(a);
int i;
if(x1 < 0) x1 = 0;
if(x1 >= a.w) x1 = a.w-1;
if(x2 < 0) x2 = 0;
if(x2 >= a.w) x2 = a.w-1;
if(y1 < 0) y1 = 0;
if(y1 >= a.h) y1 = a.h-1;
if(y2 < 0) y2 = 0;
if(y2 >= a.h) y2 = a.h-1;
for(i = x1; i <= x2; ++i){
a.data[i + y1*a.w + 0*a.w*a.h] = r;
a.data[i + y2*a.w + 0*a.w*a.h] = r;
a.data[i + y1*a.w + 1*a.w*a.h] = g;
a.data[i + y2*a.w + 1*a.w*a.h] = g;
a.data[i + y1*a.w + 2*a.w*a.h] = b;
a.data[i + y2*a.w + 2*a.w*a.h] = b;
}
for(i = y1; i <= y2; ++i){
a.data[x1 + i*a.w + 0*a.w*a.h] = r;
a.data[x2 + i*a.w + 0*a.w*a.h] = r;
a.data[x1 + i*a.w + 1*a.w*a.h] = g;
a.data[x2 + i*a.w + 1*a.w*a.h] = g;
a.data[x1 + i*a.w + 2*a.w*a.h] = b;
a.data[x2 + i*a.w + 2*a.w*a.h] = b;
}
}
void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, float g, float b)
{
int i;
for(i = 0; i < w; ++i){
draw_box(a, x1+i, y1+i, x2-i, y2-i, r, g, b);
}
}
void draw_bbox(image a, box bbox, int w, float r, float g, float b)
{
int left = (bbox.x-bbox.w/2)*a.w;
int right = (bbox.x+bbox.w/2)*a.w;
int top = (bbox.y-bbox.h/2)*a.h;
int bot = (bbox.y+bbox.h/2)*a.h;
int i;
for(i = 0; i < w; ++i){
draw_box(a, left+i, top+i, right-i, bot-i, r, g, b);
}
}
image **load_alphabet()
{
int i, j;
const int nsize = 8;
image** alphabets = (image**)xcalloc(nsize, sizeof(image*));
for(j = 0; j < nsize; ++j){
alphabets[j] = (image*)xcalloc(128, sizeof(image));
for(i = 32; i < 127; ++i){
char buff[256];
sprintf(buff, "data/labels/%d_%d.png", i, j);
alphabets[j][i] = load_image_color(buff, 0, 0);
}
}
return alphabets;
}
// Creates array of detections with prob > thresh and fills best_class for them
detection_with_class* get_actual_detections(detection *dets, int dets_num, float thresh, int* selected_detections_num, char **names)
{
int selected_num = 0;
detection_with_class* result_arr = (detection_with_class*)xcalloc(dets_num, sizeof(detection_with_class));
int i;
for (i = 0; i < dets_num; ++i) {
int best_class = -1;
float best_class_prob = thresh;
int j;
for (j = 0; j < dets[i].classes; ++j) {
int show = strncmp(names[j], "dont_show", 9);
if (dets[i].prob[j] > best_class_prob && show) {
best_class = j;
best_class_prob = dets[i].prob[j];
}
}
if (best_class >= 0) {
result_arr[selected_num].det = dets[i];
result_arr[selected_num].best_class = best_class;
++selected_num;
}
}
if (selected_detections_num)
*selected_detections_num = selected_num;
return result_arr;
}
// compare to sort detection** by bbox.x
int compare_by_lefts(const void *a_ptr, const void *b_ptr) {
const detection_with_class* a = (detection_with_class*)a_ptr;
const detection_with_class* b = (detection_with_class*)b_ptr;
const float delta = (a->det.bbox.x - a->det.bbox.w/2) - (b->det.bbox.x - b->det.bbox.w/2);
return delta < 0 ? -1 : delta > 0 ? 1 : 0;
}
// compare to sort detection** by best_class probability
int compare_by_probs(const void *a_ptr, const void *b_ptr) {
const detection_with_class* a = (detection_with_class*)a_ptr;
const detection_with_class* b = (detection_with_class*)b_ptr;
float delta = a->det.prob[a->best_class] - b->det.prob[b->best_class];
return delta < 0 ? -1 : delta > 0 ? 1 : 0;
}
void draw_detections_v3(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output)
{
static int frame_id = 0;
frame_id++;
int selected_detections_num;
detection_with_class* selected_detections = get_actual_detections(dets, num, thresh, &selected_detections_num, names);
// text output
qsort(selected_detections, selected_detections_num, sizeof(*selected_detections), compare_by_lefts);
int i;
for (i = 0; i < selected_detections_num; ++i) {
const int best_class = selected_detections[i].best_class;
printf("%s: %.0f%%", names[best_class], selected_detections[i].det.prob[best_class] * 100);
if (ext_output)
printf("\t(left_x: %4.0f top_y: %4.0f width: %4.0f height: %4.0f)\n",
round((selected_detections[i].det.bbox.x - selected_detections[i].det.bbox.w / 2)*im.w),
round((selected_detections[i].det.bbox.y - selected_detections[i].det.bbox.h / 2)*im.h),
round(selected_detections[i].det.bbox.w*im.w), round(selected_detections[i].det.bbox.h*im.h));
else
printf("\n");
int j;
for (j = 0; j < classes; ++j) {
if (selected_detections[i].det.prob[j] > thresh && j != best_class) {
printf("%s: %.0f%%", names[j], selected_detections[i].det.prob[j] * 100);
if (ext_output)
printf("\t(left_x: %4.0f top_y: %4.0f width: %4.0f height: %4.0f)\n",
round((selected_detections[i].det.bbox.x - selected_detections[i].det.bbox.w / 2)*im.w),
round((selected_detections[i].det.bbox.y - selected_detections[i].det.bbox.h / 2)*im.h),
round(selected_detections[i].det.bbox.w*im.w), round(selected_detections[i].det.bbox.h*im.h));
else
printf("\n");
}
}
}
// image output
qsort(selected_detections, selected_detections_num, sizeof(*selected_detections), compare_by_probs);
for (i = 0; i < selected_detections_num; ++i) {
int width = im.h * .002;
if (width < 1)
width = 1;
/*
if(0){
width = pow(prob, 1./2.)*10+1;
alphabet = 0;
}
*/
//printf("%d %s: %.0f%%\n", i, names[selected_detections[i].best_class], prob*100);
int offset = selected_detections[i].best_class * 123457 % classes;
float red = get_color(2, offset, classes);
float green = get_color(1, offset, classes);
float blue = get_color(0, offset, classes);
float rgb[3];
//width = prob*20+2;
rgb[0] = red;
rgb[1] = green;
rgb[2] = blue;
box b = selected_detections[i].det.bbox;
//printf("%f %f %f %f\n", b.x, b.y, b.w, b.h);
int left = (b.x - b.w / 2.)*im.w;
int right = (b.x + b.w / 2.)*im.w;
int top = (b.y - b.h / 2.)*im.h;
int bot = (b.y + b.h / 2.)*im.h;
if (left < 0) left = 0;
if (right > im.w - 1) right = im.w - 1;
if (top < 0) top = 0;
if (bot > im.h - 1) bot = im.h - 1;
//int b_x_center = (left + right) / 2;
//int b_y_center = (top + bot) / 2;
//int b_width = right - left;
//int b_height = bot - top;
//sprintf(labelstr, "%d x %d - w: %d, h: %d", b_x_center, b_y_center, b_width, b_height);
// you should create directory: result_img
//static int copied_frame_id = -1;
//static image copy_img;
//if (copied_frame_id != frame_id) {
// copied_frame_id = frame_id;
// if (copy_img.data) free_image(copy_img);
// copy_img = copy_image(im);
//}
//image cropped_im = crop_image(copy_img, left, top, right - left, bot - top);
//static int img_id = 0;
//img_id++;
//char image_name[1024];
//int best_class_id = selected_detections[i].best_class;
//sprintf(image_name, "result_img/img_%d_%d_%d_%s.jpg", frame_id, img_id, best_class_id, names[best_class_id]);
//save_image(cropped_im, image_name);
//free_image(cropped_im);
if (im.c == 1) {
draw_box_width_bw(im, left, top, right, bot, width, 0.8); // 1 channel Black-White
}
else {
draw_box_width(im, left, top, right, bot, width, red, green, blue); // 3 channels RGB
}
if (alphabet) {
char labelstr[4096] = { 0 };
strcat(labelstr, names[selected_detections[i].best_class]);
char prob_str[10];
sprintf(prob_str, ": %.2f", selected_detections[i].det.prob[selected_detections[i].best_class]);
strcat(labelstr, prob_str);
int j;
for (j = 0; j < classes; ++j) {
if (selected_detections[i].det.prob[j] > thresh && j != selected_detections[i].best_class) {
strcat(labelstr, ", ");
strcat(labelstr, names[j]);
}
}
image label = get_label_v3(alphabet, labelstr, (im.h*.02));
//draw_label(im, top + width, left, label, rgb);
draw_weighted_label(im, top + width, left, label, rgb, 0.7);
free_image(label);
}
if (selected_detections[i].det.mask) {
image mask = float_to_image(14, 14, 1, selected_detections[i].det.mask);
image resized_mask = resize_image(mask, b.w*im.w, b.h*im.h);
image tmask = threshold_image(resized_mask, .5);
embed_image(tmask, im, left, top);
free_image(mask);
free_image(resized_mask);
free_image(tmask);
}
}
free(selected_detections);
}
void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes)
{
int i;
for(i = 0; i < num; ++i){
int class_id = max_index(probs[i], classes);
float prob = probs[i][class_id];
if(prob > thresh){
//// for comparison with OpenCV version of DNN Darknet Yolo v2
//printf("\n %f, %f, %f, %f, ", boxes[i].x, boxes[i].y, boxes[i].w, boxes[i].h);
// int k;
//for (k = 0; k < classes; ++k) {
// printf("%f, ", probs[i][k]);
//}
//printf("\n");
int width = im.h * .012;
if(0){
width = pow(prob, 1./2.)*10+1;
alphabet = 0;
}
int offset = class_id*123457 % classes;
float red = get_color(2,offset,classes);
float green = get_color(1,offset,classes);
float blue = get_color(0,offset,classes);
float rgb[3];
//width = prob*20+2;
rgb[0] = red;
rgb[1] = green;
rgb[2] = blue;
box b = boxes[i];
int left = (b.x-b.w/2.)*im.w;
int right = (b.x+b.w/2.)*im.w;
int top = (b.y-b.h/2.)*im.h;
int bot = (b.y+b.h/2.)*im.h;
if(left < 0) left = 0;
if(right > im.w-1) right = im.w-1;
if(top < 0) top = 0;
if(bot > im.h-1) bot = im.h-1;
printf("%s: %f%%", names[class_id], prob * 100);
//printf(" - id: %d, x_center: %d, y_center: %d, width: %d, height: %d",
// class_id, (right + left) / 2, (bot - top) / 2, right - left, bot - top);
printf("\n");
draw_box_width(im, left, top, right, bot, width, red, green, blue);
if (alphabet) {
image label = get_label(alphabet, names[class_id], (im.h*.03)/10);
draw_label(im, top + width, left, label, rgb);
}
}
}
}

It is a design issue, rather then a right/wrong answer. I would suggest when in
draw_detections_v3() when selected_detections_num > 0:
int selected_detections_num = 0 ;
detection_with_class* selected_detections = get_actual_detections(dets, num, thresh, &selected_detections_num, names);
// Your alarm code here (example)
if( selected_detections_num )
{
putchar( `\a` ) ; // Audible alert to console - replace with
// suitable audible alert if you need.
}

Related

Finding distance in a matrix in C

I have a matrix of numbers and want to find the distance of each item from its farthest non-zero neighbour (in four directions). I came up with this idea
#include <stdlib.h>
#include <stdio.h>
int min(int a, int b, int c, int d)
{
int e = a < b ? a : b;
int f = c < d ? c : d;
int r = e < f ? e : f;
return r;
}
int main()
{
int width = 50;
int height = 50;
int points[width][height];
int distances[width][height][5]; // 0 left 1 right 2 bottom 3 top 4 min
// adding some random values, zero and non-zero
for (int y = 0; y < height; y++)
{
for (int x = 0; x < width; x++)
{
points[x][y] = rand() % 100;
}
}
// scanning in four direction to check if the previous neighbour exists
for (int y = 0; y < height; y++)
{
for (int x = 0; x < width; x++)
{
if (points[x][y] > 0)
{
distances[x][y][0] = distances[x - 1][y][0] > 0 ? distances[x - 1][y][0] + 1 : 1;
}
}
for (int x = width - 1; x >= 0; x--)
{
if (points[x][y] > 0)
{
distances[x][y][1] = distances[x + 1][y][1] > 0 ? distances[x + 1][y][1] + 1 : 1;
}
}
}
for (int x = 0; x < width; x++)
{
for (int y = 0; y < height; y++)
{
if (points[x][y] > 0)
{
distances[x][y][2] = distances[x][y - 1][2] > 0 ? distances[x][y - 1][2] + 1 : 1;
}
}
for (int y = height - 1; y >= 0; y--)
{
if (points[x][y] > 0)
{
distances[x][y][3] = distances[x][y + 1][3] > 0 ? distances[x][y + 1][3] + 1 : 1;
}
}
}
// finding the minimum of four distances
for (int y = 0; y < height; y++)
{
for (int x = 0; x < width; x++)
{
if (points[x][y] > 0)
{
distances[x][y][4] = min(distances[x][y][0], distances[x][y][1], distances[x][y][2], distances[x][y][3]);
printf("%d %d %d %d %d %d %d \n", x, y, distances[x][y][0], distances[x][y][1], distances[x][y][2], distances[x][y][3], distances[x][y][4]);
}
}
}
return 0;
}
but it doesn't work as expected. Most likely, I have made a stupid mistake and have a blind eye for that to see.
At line 34:
if (points[x][y] > 0)
{
distances[x][y][0] = distances[x - 1][y][0] > 0 ? distances[x - 1][y][0] + 1 : 1;
}
when x is zero, you are referencing up to 250 (50 * 5) words before the address of distances, which is an invalid thing to do.

C Program runs in repl it compiler but not on gcc (seg fault) Cant find it

Apologies for the horrible indentation. For whatever reason it wont paste with it.
As in the title Im getting a segmentation error pointing to both main and ReadData funtion. However it doesnt say in which line. Ive tried multiple changes and it end sup in the same.
edit: gdb gives me:at vfscanf.c:1898 1898 vfscanf.c: No such file or directory.
As input Im using in.txt with:
-1 1 5 14
3 1
-6 -2
-4 2
4 -4
2 4
-1 3
2 2
0 -2
-4 -2
-6 6
4 4
-2 4
2 -2
-4 6
C CODE:
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
/* point structure to store x & y co-ordinates */
typedef struct {
int x, y;
}
point;
/* Prototype declarations*/
point * ReadData(point * center, int * radius, int * nPoints);
point * FilterData(point * data, point center, int radius, int * nPoints);
void Merge(point * data, int p, int q, int r);
void MergeSort(point * data, int p, int r);
void BinarySearch(point * data, int nPoints, point p);
void SearchPhase(point * data, int nPoints);
int main() {
point * data, * filter;
point center;
FILE * fp = fopen("out.txt", "w"); // output file out.txt
int radius, nPoints, i;
data = ReadData( & center, & radius, & nPoints); //call of ReadData()
filter = FilterData(data, center, radius, & nPoints); // call of FilterData()
MergeSort(filter, 0, nPoints - 1); // call of MersgeSort()
for (i = 0; i < nPoints; i++)
fprintf(fp, "%d\t%d\n", filter[i].x, filter[i].y); // writing to output file
fclose(fp);
printf("Filtered and sorted data written to out.txt");
SearchPhase(filter, nPoints); // call of SearchPhase()
return 0;
}
point * ReadData(point * center, int * radius, int * nPoints) {
FILE * fp = fopen("in.txt", "r"); // input file in.txt
point * ptr;
int i, n;
/* reading of center radius and no. of poins from file*/
fscanf(fp, "%d%d%d%d", & center - > x, & center - > y, radius, nPoints);
n = * nPoints;
ptr = (point * ) malloc(sizeof(point) * n); // dynamic memory allocation
for (i = 0; i < n; i++)
fscanf(fp, "%d%d", & ptr[i].x, & ptr[i].y); // reading of x and y from file
fclose(fp);
return ptr;
}
point * FilterData(point * data, point center, int radius, int * nPoints) {
point * filter;
int n = * nPoints, i, j, pos, x, y;
double dist;
int * a = (int * ) calloc(sizeof(int), n); // dynamic memory allocation
for (i = 0, j = 0; i < n; i++) {
x = data[i].x;
y = data[i].y;
dist = sqrt(pow(center.x - x, 2) + pow(center.y - y, 2)); // distance between center and point
if (dist <= (double) radius) // if distance <= radius then point is within circle
{
a[j] = i;
j++;
}
}
* nPoints = j;
filter = (point * ) malloc(sizeof(point) * j); // dynamic memory allocation
for (i = 0; i < j; i++) {
pos = a[i];
filter[i] = data[pos];
}
return filter;
}
/* Merge()*/
void Merge(point * data, int p, int q, int r) {
int n1 = q - p + 1;
int n2 = r - q;
point L[n1 + 1], R[n2 + 1];
int i, j, k;
for (i = 0; i < n1; i++)
L[i] = data[p + i];
for (j = 0; j < n2; j++)
R[j] = data[q + 1 + j];
point sentinel;
sentinel.x = 999;
sentinel.y = 999;
L[n1] = sentinel;
R[n2] = sentinel;
i = 0, j = 0;
for (k = p; k <= r; k++) {
if (L[i].x < R[j].x || (L[i].x == R[j].x && L[i].y < R[j].y)) {
data[k] = L[i];
i++;
} else {
data[k] = R[j];
j++;
}
}
}
/* MergeSort()*/
void MergeSort(point * data, int p, int r) {
int q, i;
if (p < r) {
q = (p + r) / 2;
MergeSort(data, p, q);
MergeSort(data, q + 1, r);
Merge(data, p, q, r);
}
}
/* BinarySearch()*/
void BinarySearch(point * data, int nPoints, point p) {
int l = 0, u = nPoints - 1, m, flag = 0;
while (l <= u) {
m = (l + u) / 2;
if (data[m].x == p.x && data[m].y == p.y) {
flag = 1;
break;
} else if (p.x < data[m].x || (p.x == data[m].x && p.y < data[m].y))
u = m - 1;
else
l = m + 1;
}
if (flag)
printf("\nOutput: Found at record %d", m + 1);
else
printf("\nOutput: Not Found");
}
/*SearchPhase()*/
void SearchPhase(point * data, int nPoints) {
point p;
while (1) {
printf("\nSearch input ( x y): ");
scanf("%d%d", & p.x, & p.y);
if (p.x == -999 || p.y == -999) {
printf("\nOutput: Exit\n");
break;
}
BinarySearch(data, nPoints, p);
}
}

How to optimize my c code?

I tried to implement C code for Wavelet transform in FPGA (Zynq ZC 702) but the code get stuck and this is because of memory problem so I should optimize my code but I don't know how.
Can anyone please give me some ideas how to do that ?
This is the main of the code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "wavemin.h"
#include "waveaux.h"
#include "waveaux.c"
#include "wavemin.c"
int main() {
printf("Hello World1 \n\r");
wave_object obj;
wt_object wt;
float *inp, *out;
int N, i, J,k;
float temp[1280] = {};
char *name = "db4";
obj = wave_init(name);
printf("Hello World2 \n\r");
N = 1280;
inp = (float*)malloc(sizeof(float) * N);
out = (float*)malloc(sizeof(float) * N);
//wmean = mean(temp, N);
for (i = 0; i < N; ++i) {
inp[i] = temp[i];
printf("Hello World3 \n\r");
//printf("%g \n", inp[i]);
}
J = 4; //Decomposition Levels
wt = wt_init(obj, "dwt", N, J); // Initialize the wavelet transform object
printf("Hello World4 \n\r");
setDWTExtension(wt, "sym"); // Options are "per" and "sym". Symmetric is the default option
printf("Hello World5 \n\r");
setWTConv(wt, "direct");
printf("Hello World6 \n\r");
dwt(wt, inp); // Perform DWT
printf("Hello World7 \n\r");
//getDWTAppx(wt, out, wt->length[0]);
// printf("Approximation Coefficients Level 1 \n");
// for (i = 0; i < wt->length[0]; ++i) {
// printf("%g ", out[i]);
// }
// printf("\n\n");
for (k = 1; k <= J; ++k) {
getDWTDetail(wt, out, wt->length[k], k);
printf("Detail Coefficients Level %d Length %d \n",
k, wt - length[k]);
for (i = 0; i < wt->length[k]; ++i) {
printf("%g ", out[i]);
}
printf("\n\n");
}
wt_summary(wt);// Prints the full summary.
printf("Hello World8 \n\r");
wave_free(obj);
wt_free(wt);
free(inp);
free(out);
return 0;
}
The other part of the code where there is the function used in the main function:
#include "wavemin.h"
wave_object wave_init(char *wname) {
wave_object obj = NULL;
int retval;
retval = 0;
if (wname != NULL) {
retval = filtlength(wname);
}
obj = (wave_object)malloc(sizeof(struct wave_set) + sizeof(float) * 4 *
retval);
obj->filtlength = retval;
obj->lpd_len = obj->hpd_len = obj->lpr_len = obj->hpr_len = obj->filtlength;
strcpy(obj->wname, wname);
if (wname != NULL) {
filtcoef(wname, obj->params, obj->params + retval, obj->params + 2 *
retval, obj->params + 3 * retval);
}
obj->lpd = &obj->params[0];
obj->hpd = &obj->params[retval];
obj->lpr = &obj->params[2 * retval];
obj->hpr = &obj->params[3 * retval];
return obj;
}
wt_object wt_init(wave_object wave, char *method, int siglength, int J) {
int size, i, MaxIter;
wt_object obj = NULL;
size = wave->filtlength;
MaxIter = wmaxiter(siglength, size);
if (!strcmp(method, "dwt") || !strcmp(method, "DWT")) {
obj = (wt_object)malloc(sizeof(struct wt_set) + sizeof(float) *
(siglength + 2 * J * (size + 1)));
obj->outlength = siglength + 2 * J * (size + 1); // Default
strcpy(obj->ext, "sym"); // Default
}
obj->wave = wave;
obj->siglength = siglength;
obj->J = J;
obj->MaxIter = MaxIter;
strcpy(obj->method, method);
if (siglength % 2 == 0) {
obj->even = 1;
}
else {
obj->even = 0;
}
strcpy(obj->cmethod, "direct"); // Default
obj->cfftset = 0;
obj->lenlength = J + 2;
obj->output = &obj->params[0];
if (!strcmp(method, "dwt") || !strcmp(method, "DWT")) {
for (i = 0; i < siglength + 2 * J * (size + 1); ++i) {
obj->params[i] = 0.0;
}
}
//wave_summary(obj->wave);
return obj;
}
static void dwt_sym(wt_object wt, float *inp, int N, float *cA, int len_cA,
float *cD, int len_cD) {
int i, l, t, len_avg;
len_avg = wt->wave->lpd_len;
for (i = 0; i < len_cA; ++i) {
t = 2 * i + 1;
cA[i] = 0.0;
cD[i] = 0.0;
for (l = 0; l < len_avg; ++l) {
if ((t - l) >= 0 && (t - l) < N) {
cA[i] += wt->wave->lpd[l] * inp[t - l];
cD[i] += wt->wave->hpd[l] * inp[t - l];
printf("world1 \n\r");
}
else if ((t - l) < 0) {
cA[i] += wt->wave->lpd[l] * inp[-t + l - 1];
cD[i] += wt->wave->hpd[l] * inp[-t + l - 1];
printf("world2 \n\r");
}
else if ((t - l) >= N) {
cA[i] += wt->wave->lpd[l] * inp[2 * N - t + l - 1];
cD[i] += wt->wave->hpd[l] * inp[2 * N - t + l - 1];
printf("world3 \n\r");
}
}
}
}
void dwt(wt_object wt, float *inp) {
int i, J, temp_len, iter, N, lp;
int len_cA;
float *orig, *orig2;
temp_len = wt->siglength;
J = wt->J;
wt->length[J + 1] = temp_len;
wt->outlength = 0;
wt->zpad = 0;
orig = (float*)malloc(sizeof(float) * temp_len);
orig2 = (float*)malloc(sizeof(float) * temp_len);
for (i = 0; i < wt->siglength; ++i) {
orig[i] = inp[i];
printf("Hello1 \n\r");
}
if (wt->zpad == 1) {
orig[temp_len - 1] = orig[temp_len - 2];
printf("Hello2 \n\r");
}
N = temp_len;
lp = wt->wave->lpd_len;
if (!strcmp(wt->ext, "sym")) {
//printf("\n YES %s \n", wt->ext);
i = J;
while (i > 0) {
N = N + lp - 2;
N = (int)ceil((float)N / 2.0);
wt->length[i] = N;
wt->outlength += wt->length[i];
i--;
}
wt->length[0] = wt->length[1];
wt->outlength += wt->length[0];
N = wt->outlength;
printf("Hello3 \n\r");
for (iter = 0; iter < J; ++iter) {
len_cA = wt->length[J - iter];
N -= len_cA;
dwt_sym(wt, orig, temp_len, orig2, len_cA, wt->params + N, len_cA);
temp_len = wt->length[J - iter];
printf("Hello4 \n\r");
if (iter == J - 1) {
for (i = 0; i < len_cA; ++i) {
wt->params[i] = orig2[i];
printf("Hello5 \n\r");
}
} else {
for (i = 0; i < len_cA; ++i) {
orig[i] = orig2[i];
printf("Hello6 \n\r");
}
}
}
} else {
printf("Signal extension can be either per or sym");
exit(-1);
}
free(orig);
free(orig2);
}
void setDWTExtension(wt_object wt, char *extension) {
if (!strcmp(extension, "sym")) {
strcpy(wt->ext, "sym");
} else {
printf("Signal extension can be either per or sym");
exit(-1);
}
}
void setWTConv(wt_object wt, char *cmethod) {
if (!strcmp(cmethod, "direct")) {
strcpy(wt->cmethod, "direct");
}
}
void getDWTDetail(wt_object wt, float *detail, int N, int level) {
/*
returns Detail coefficents at the jth level where j = 1,2,.., J
and Wavelet decomposition is stored as
[A(J) D(J) D(J-1) ..... D(1)] in wt->output vector
Use getDWTAppx() to get A(J)
Level 1 : Length of D(J), ie N, is stored in wt->length[1]
Level 2 :Length of D(J-1), ie N, is stored in wt->length[2]
....
Level J : Length of D(1), ie N, is stored in wt->length[J]
*/
int i, iter, J;
J = wt->J;
if (level > J) {
printf("The decomposition only has %d levels", J);
}
iter = wt->length[0];
for (i = 1; i < level; ++i) {
iter += wt->length[i];
}
for (i = 0; i < N; ++i) {
detail[i] = wt->output[i + iter];
}
}
void getDWTAppx(wt_object wt, float *appx, int N) {
/*
Wavelet decomposition is stored as
[A(J) D(J) D(J-1) ..... D(1)] in wt->output vector
Length of A(J) , N = wt->length[0]
*/
int i;
for (i = 0; i < N; ++i) {
appx[i] = wt->output[i];
}
}
void wt_summary(wt_object wt) {
int i;
int J, t;
J = wt->J;
printf("Wavelet Coefficients are contained in vector : %s \n", "output");
printf("\n");
printf("Approximation Coefficients \n");
printf("Level %d Access : output[%d] Length : %d \n",
1, 0, wt->length[0]);
printf("\n");
printf("Detail Coefficients \n");
t = wt->length[0];
for (i = 0; i < J; ++i) {
printf("Level %d Access : output[%d] Length : %d \n",
i + 1, t, wt->length[i + 1]);
t += wt->length[i + 1];
}
printf("\n");
}
void wave_free(wave_object object) {
free(object);
}
void wt_free(wt_object object) {
free(object);
}
enter image description here
In your code
Always check if malloc has returned non NULL value
Check your stack and heap settings in the linker file as you declare massive local variables and do a lots of mallocs - I suspect the (nomen omen)stack overflow, or failed mallocs.
Is it a bare metal program or you run it under some kind of OS?
Just for a matter of style and concision, I would rewrite this:
if (siglength % 2 == 0) {
obj->even = 1;
}
else {
obj->even = 0;
}
Into the following code:
obj->even = !(siglength % 2);
Or, alternatively:
obj->even = (siglength % 2) ? 0 : 1;
Also, I think there is room for optimization in this function:
static void dwt_sym(wt_object wt, float *inp, int N, float *cA, int len_cA,
float *cD, int len_cD) {
int i, l, t, len_avg;
len_avg = wt->wave->lpd_len;
for (i = 0; i < len_cA; ++i) {
t = 2 * i + 1;
cA[i] = 0.0;
cD[i] = 0.0;
for (l = 0; l < len_avg; ++l) {
if ((t - l) >= 0 && (t - l) < N) {
cA[i] += wt->wave->lpd[l] * inp[t - l];
cD[i] += wt->wave->hpd[l] * inp[t - l];
printf("world1 \n\r");
}
else if ((t - l) < 0) {
cA[i] += wt->wave->lpd[l] * inp[-t + l - 1];
cD[i] += wt->wave->hpd[l] * inp[-t + l - 1];
printf("world2 \n\r");
}
else if ((t - l) >= N) {
cA[i] += wt->wave->lpd[l] * inp[2 * N - t + l - 1];
cD[i] += wt->wave->hpd[l] * inp[2 * N - t + l - 1];
printf("world3 \n\r");
}
}
}
}
First, you are always referring to t - 1 and never t itself, so why not have:
t = 2 * i;
And, I can guess that a lot of computation can be placed outside of the inner loop... If you want to optimize, there are many good candidate here.
One last word about optimization!
You should first profile your software and see where you spend the most time before thinking about optimization. You cannot optimize "in the air" without knowing where your software does really struggle. Consider using gprof.
PS: You should never ever use the letter l (ell) as a variable... it is way to close from the number 1 (one). Consider changing this is also, it can improve the reading.

Simple C example of add/sub/mul/div operations in double-precision floating-points using a single-precision Floating-point system

I am working on an algorithm which requires calculations in large numbers, upto e+30. I am using a 32 bit system with compiler support of 32 bits for long/float/double. So far, by searching online, I've learned that single-precision floating points (FPs) can be used for Double-precision FPs.
From this question asked by someone earlier (Emulate “double” using 2 “float”s) I found this paper which has the algorithm to work with Double-precision FPs in GPUs. It is too confusing for me to implement in C. I just need four basic mathematical operations. Is there any way I could find an example for this which will help me understand it better?
Thanks in advance.
Here is the Code I am working on. It might have errors i can not see, any suggestions would be appreciated to rectify error but that is preety much what I am trying to implement. In the algorithm, POLYNOMIAL_ORDER should be able to go up to forth order (can settle at Third order if the standard deviation is smaller). Few things I am not sure about are 1) Procedures make_float() and make_float() are correct or not, 2) Use of make_float() in the program.
#define POLYNOMIAL_ORDER (3)
#define TC_TABLE_SIZE (14)
typedef struct vector_float2{
float x;
float y;
}float2;
typedef struct
{
float tc0;
float tc1;
float tc2;
float tc3;
}POLYNOMIALS;
typedef struct {
int16_t Temp;
int16_t Comp;
} TempCompPair;
volatile TempCompPair TCtable[TC_TABLE_SIZE] = {{22452,1651},
{25318,1444},
{28268,1133},
{31120,822},
{34027,511},
{36932,185},
{39770,-81},
{42685,-288},
{45531,-407},
{48425,-632},
{51401,-703},
{54460,-1143},
{57202,-1420},
{60027,-1652}};
POLYNOMIALS polynomials;
float matrix[TC_TABLE_SIZE][TC_TABLE_SIZE] = {0};
float average[TC_TABLE_SIZE] = {0};
float make_float(float x, float y)
{
return x+y;
}
float2 make_float2(float a, float b)
{
float2 f2 = {a,b};
return f2;
}
float2 quickTwoSum(float a, float b)
{
float s = a+b;
float e = b - (s - a);
float2 result = {s, e};
return result;
}
float2 twoSum(float a, float b)
{
volatile float s = a + b;
float v = s - a;
float e = (a - (s - v)) + (b - v);
float2 result = {s , e};
return result;
}
float2 df64_add(float2 a, float2 b)
{
float2 s,t;
s = twoSum(a.x, b.x);
t = twoSum(a.y, b.y);
s.y += t.x;
s = quickTwoSum(s.x, s.y);
s.y += t.y;
s = quickTwoSum(s.x, s.y);
return s;
}
float2 split(float a)
{
const float split = 4097; //(1<<12) + 1
float t = a *split;
float a_hi = t - (t - a);
float a_lo = a - a_hi;
float2 result = {a_hi, a_lo};
return result;
}
float2 twoProd(float a, float b)
{
float p = a*b;
float2 aS = split(a);
float2 bS = split(b);
float err = ((aS.x * bS.x - p)
+ aS.x * bS.y + aS.y * bS.x)
+ aS.y * bS.y;
float2 result = {p, err};
return result;
}
float2 df64_mult(float2 a, float2 b)
{
float2 p;
p = twoProd(a.x,b.x);
p.y += a.x * b.y;
p.y += a.y * b.x;
p = quickTwoSum(p.x,p.y);
return p;
}
float2 calculate_power(float base, int pow)
{
int i = 0;
float2 base_f2 = make_float2(base,0);
float2 result_f2 = {1,0};
if(pow == 0)
{
return result_f2;
}
if(pow > 0)
{
if(pow == 1)
{
return base_f2;
}
else
{
for(i = 0; i < pow; i++)
{
result_f2 = df64_mult(result_f2,base_f2);
}
return result_f2;
}
}
else
{
return result_f2;
//Mechanism for negative powers
}
}
void TComp_Polynomial()
{
int i;
int j;
int k;
int size;
float temp;
float2 sum = {0,0};
float2 result0 = {0,0};
float2 result1 = {0,0};
float x[TC_TABLE_SIZE];
float y[TC_TABLE_SIZE];
for(i = 0; i < TC_TABLE_SIZE; i++)
{
x[i] = (float) TCtable[i].Temp;
y[i] = (float) TCtable[i].Comp;
}
size = i;
for(i = 0; i <= POLYNOMIAL_ORDER; i++)
{
for(j = 0; j <= POLYNOMIAL_ORDER; j++)
{
sum.x = 0;
sum.y = 0;
for(k = 0; k < size; k++)
{
// Expression simplified below: **sum += pow(x[k],i+j)**
result0 = calculate_power(x[k], i+j);
sum = df64_add(result0,sum);
}
matrix[i][j] = make_float(sum.x,sum.y);
}
}
for(i = 0; i <= POLYNOMIAL_ORDER; i++)
{
sum.x = 0;
sum.y = 0;
for(j = 0; j < size; j++)
{
// Expression simplified below: **sum += y[j] * pow(x[j],i)**
result0 = calculate_power(x[j], i);
result1 = df64_mult( result0 , make_float2(y[j],0) );
sum = df64_add(result1,sum);
}
average[i] = make_float(sum.x,sum.y);
}
for(i = 0; i <= POLYNOMIAL_ORDER; i++)
{
for(j = 0; j <= POLYNOMIAL_ORDER; j++)
{
if(j != i)
{
if(matrix[i][i]!= 0)
{
temp = matrix[j][i]/matrix[i][i];
}
for(k = i; k < POLYNOMIAL_ORDER; k++)
{
matrix[j][k] -= temp*matrix[i][k];
}
average[j] -= temp*average[i];
}
}
}
if(matrix[0][0] != 0)
{
polynomials.tc0 = average[0]/matrix[0][0];
}
if(matrix[1][1] != 0)
{
polynomials.tc1 = average[1]/matrix[1][1];
}
if(matrix[2][2] != 0)
{
polynomials.tc2 = average[2]/matrix[2][2];
}
if(matrix[3][3] != 0)
{
polynomials.tc3 = average[3]/matrix[3][3];
}
}
and then use the struct polynomials.tc0/1/2/3 in below expression
// Y = T^3 * X3 + T^2 * X2 + T^1 * X1 + X0 ;
double calculate_equation(uint16_t TEMP)
{
double Y;
if(POLYNOMIAL_ORDER == 1)
{
Y = polynomials.tc1*(double)TEMP + polynomials.tc0;
}
else if(POLYNOMIAL_ORDER == 2)
{
Y = (polynomials.tc2 * (double)TEMP + polynomials.tc1)*(double)TEMP + polynomials.tc0;
}
else if(POLYNOMIAL_ORDER == 3)
{
Y = ((polynomials.tc3 * (double)TEMP + polynomials.tc2)*(double)TEMP + polynomials.tc1)*(double)TEMP + polynomials.tc0;
}
else if(POLYNOMIAL_ORDER == 4)
{
Y = (((polynomials.tc4 * (double)TEMP + polynomials.tc3)*(double)TEMP + polynomials.tc2)*(double)TEMP + polynomials.tc1)*(double)TEMP + polynomials.tc0;
}
return Y;
}
And standard Deviation is calculated is as follows:
//sqrt(sigma(error^2))
for(i = 0; i < TC_TABLE_SIZE; i++)
{
actual_comp[i] =(int) calculate_equation(TCtable[i].Temp);
error[i] = TCtable[i].Comp - actual_comp[i] ;
error_sqr += error[i]*error[i];
printf("%u\t%d\t\t%e\n", TCtable[i].Temp, TCtable[i].Comp, actual_comp[i] );
}
error_sqrt = sqrt(error_sqr);
Reference:
http://hal.archives-ouvertes.fr/docs/00/06/33/56/PDF/float-float.pdf Guillaume Da Graça, David Defour Implementation of float-float operators on graphics hardware, 7th conference on Real Numbers and Computers, RNC7.
I was able to implement this code without using double precision as the calculations were in the range of Float.
Here's my implementation, let me know if I can optimize it better.
typedef struct
{ int64_t tc0;
int64_t tc1;
int64_t tc2;
int64_t tc3;
int64_t tc4;
}POLYNOMIALS;
POLYNOMIALS polynomials = {0,0,0,0,0};
int16_t TempCompIndex;
int64_t x[TC_TABLE_SIZE];
int64_t y[TC_TABLE_SIZE];
float matrix[POLYNOMIAL_ORDER+1][POLYNOMIAL_ORDER+1] = {0};
float average[POLYNOMIAL_ORDER+1] = {0};
void TComp_Polynomial()
{
int i;
int j;
int k;
int size;
float temp;
float sum = 0;
float powr = 0;
float prod;
int64_t x[TC_TABLE_SIZE];
int64_t y[TC_TABLE_SIZE];
for(i = 0; i < TC_TABLE_SIZE; i++)
{
x[i] = (int64_t) TCtable[i].Temp;
y[i] = (int64_t) TCtable[i].Comp<<PRECISION;
printf("x: %lld, y:%lld\n",x[i],y[i]);
}
size = i;
for(i = 0; i <= POLYNOMIAL_ORDER; i++)
{
for(j = 0; j <= POLYNOMIAL_ORDER; j++)
{
sum = 0;
powr = 0;
for(k = 0; k < size; k++)
{
//printf("x[%d]: %ld, i: %d ,j: %d ", k, x[k],i,j);
powr = pow(x[k],i+j);
//printf("Power: %f, sum: %f\n ",powr,sum);
sum += powr;
//printf("%f\r\n",powr);
//printf("sum: %lf\n",sum );
}
matrix[i][j] = sum;
printf("sum: %g\n",sum);
}
}
for(i = 0; i <= POLYNOMIAL_ORDER; i++)
{
sum = 0;
powr = 0;
for(j = 0; j < size; j++)
{
//sum += y[j] * pow(x[j],i)
//printf("sum: %lf, y[%d]: %lf, x[%d]: %lf^%d ",sum,j,y[j], i, x[j],i);
//printf("x[%d]:%lld ^ %d\t",j,x[j],i);
powr = (float) pow(x[j],i);
printf("powr: %f\t",powr);
prod = (float) y[j] * powr;
printf("prod:%f \t %lld \t", prod,y[j]);
sum += (float) prod;
printf("sum: %f \n",sum);
}
average[i] = sum;
//printf("#Avg: %f\n",average[i]);
}
printf("\n\n");
for(i = 0; i <= POLYNOMIAL_ORDER; i++)
{
for(j = 0; j <= POLYNOMIAL_ORDER; j++)
{
if(j != i)
{
if(matrix[i][i]!= 0)
{
//printf("matrix%d%d: %g / matrix%d%d: %g =\t ",j,i,matrix[j][i],i,i,matrix[i][i]);
temp = matrix[j][i]/matrix[i][i];
//printf("Temp: %g\n",temp);
}
for(k = i; k < POLYNOMIAL_ORDER; k++)
{
matrix[j][k] -= temp*matrix[i][k];
//printf("matrix[%d][%d]:%g, %g, matrix[%d][%d]:%g\n",j,k,matrix[j][k], temp,i,k,matrix[i][k]);
}
//printf("\n\n");
//print_matrix();
printf("\n\n");
//printf("avg%d: %g\ttemp: %g\tavg%d: %g\n\n",j,average[j],temp,i,average[i]);
average[j] -= temp*average[i];
printf("#Avg%d:%g\n",j,average[j]);
//print_average();
}
}
}
print_matrix();
print_average();
/* Calculate polynomial Coefficients (n+1) based on the POLYNOMIAL_ORDER (n) */
#ifndef POLYNOMIAL_ORDER
#elif POLYNOMIAL_ORDER == 0
if(matrix[0][0] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
}
#elif POLYNOMIAL_ORDER == 1
if(matrix[1][1] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
}
#elif POLYNOMIAL_ORDER == 2
if(matrix[2][2] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
}
#elif POLYNOMIAL_ORDER == 3
if(matrix[3][3] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
}
#elif POLYNOMIAL_ORDER == 4
if(matrix[4][4] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
polynomials.tc4 = (int64_t) (average[4]/matrix[4][4]);
}
#endif
}
int16_t calculate_equation(uint16_t TEMP)
{
int64_t Y = 0;
int16_t TempComp = 0;
#ifndef POLYNOMIAL_ORDER
#elif POLYNOMIAL_ORDER == 0
Y = polynomials.tc0;
#elif POLYNOMIAL_ORDER == 1
Y = polynomials.tc1* ((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 2
Y = (polynomials.tc2 * ((int64_t)TEMP) + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 3
Y = ((polynomials.tc3 * ((int64_t)TEMP) + polynomials.tc2)*((int64_t)TEMP) + polynomials.tc1)*((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 4
Y = (((polynomials.tc4 * (int64_t)TEMP + polynomials.tc3)*(int64_t)TEMP + polynomials.tc2)*(int64_t)TEMP + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#endif
TempComp = (int16_t) (Y>>PRECISION_BITS);
return TempComp;
}
void main(){
int16_t TempComp = 0;
TempCompValue = (int16_t) calculate_equation(Mon_Temp);
}
Note: Calculate_Equation() is being called once a second and it is required to not use float in order to avoid floating point arithmetic, hence I am using non-float variables in that function.
It is working right for me and haven't discovered any bug after initial testing.
Thanks every one for taking interest in my post, if not the answer, got to learn some new techniques. And thanks #chux.

How can I improve locality of reads and writes in the following code?

I'm working on the following image convolution code:
typedef struct fmatrix{
int rows;
int cols;
float** array;
} fmatrix;
typedef struct image{
unsigned char* data;
int w;
int h;
int c;
} image;
typedef struct kernel{
fmatrix* psf;
int divisor;
} kernel;
void convolve_sq(image* src, image* dst, kernel* psf, int pixel){
int size = psf->psf->rows * psf->psf->cols;
float tmp[size];
int n, m; //for psf
int x, y, x0, y0, cur; //for image
y0 = pixel / (src->w * src->c);
x0 = (pixel / src->c) % src->w;
for (n = 0; n < psf->psf->rows; ++n){
for (m = 0; m < psf->psf->cols; ++m){
y = n - (psf->psf->rows / 2);
x = m - (psf->psf->cols / 2);
if ((y + y0) < 0 || (y + y0) >= src->h || (x + x0) < 0 || (x + x0) >= src->w){
tmp[n*psf->psf->rows+m] = 255 * psf->psf->array[n][m];
}
else{
cur = (pixel + y * src->w * src->c + x * src->c);
tmp[n*psf->psf->rows+m] = src->data[cur] * psf->psf->array[n][m]; //misses on read
}
}
}
m = 0;
for (n = 0; n < size; ++n){
m += (int) tmp[n];
}
m /= psf->divisor;
if (m < 0) m = 0;
if (m > 255) m = 255;
dst->data[pixel] = m; //misses on write
}
void convolve_image(image* src, image* dst, kernel* psf){
int i, j, k;
for (i = 0; i < src->h; ++i){
for (j = 0; j < src->w; ++j){
for (k = 0; k < src->c; ++k){
convolve_sq(src, dst, psf, (i * src->w * src->c + j * src->c + k) );
}
}
}
}
Running cachegrind, I've determined two places where there are a substantial number of cache misses, which I've annotated in the code above. For the line marked "misses on read", there were 97,205 D1mr and 97,201 DLmr. For the line marked "misses on write", there were 97,201 D1mw and DLmw. These lines read and write directly to/from the image respectively.
How can I make this code more efficient, in terms of avoiding cache misses?

Resources