CUDA code not processing if block properly - c

Stuck at if block right below //step 5, the issue is that the code will not progress into or after the given if block. I need to figure out how to get this particular issue settled before starting the task of generating parallel code. If you run the code you will see one print statement that indicates the value of "one" and another two for "i" and "j". After the if block begins, none of the other print statements are hit. As a result I am quite stuck, I am aware that this is a specific issue, however, I cannot seem to determine it's cause.
Any help is appreciated!
Thanks in advance!
Input file sample.
>386.fasta.screen.Contig1
GAGTTTGATCCTGGCTCAGAATCAACGCTGGCGGCGCGCTTAACACATGC
AAGTCGAACGAGAAAGTGGAGCAATCCATGAGTACAGTGGCGTACGGGTG
AGTAACACGTGGGTAATCTACCTCTTAGTGGGGAATAACTTTGGGAAACC
GAAGCTAATACCGCATAAGCTCGAGAGAGGAAAGCAGCAATGCGCTGAGA
GAGGAGCCCGCGGCCGATTAGCTAGTTGGCAGGGTAAAAGCCTACCAAGG
CAGAGATCGGTAGCCGGCCTGAGAGGGCACACGGCCACACTGGCACTGAA
ACACGGGCCAGACTCCTACGGGAGGCAGCAGTGGGGAATCTTGCACAATG
GGGGCAACCCTGATGCAGCGACGCCGCGTGAGCGATGAAGCCCTTCGGGG
TGTAAAGCTCTTTCGTCAGGGAAGATAGTGACGGTACCTGGAGAAGCAGC
TGCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGCAGCGAGCGT
TGTTCGGAGTTACTGGGCGTAAAGGGTGTGTAGGCGGTTGTTTAAGTTTG
GTGTGAAATCTCCCGGCTCAACTGGGAGGGTGCGCCGAATACTGAGCGAC
TAGAGTGCGGGAGAGGAAAGTGGAATTCCTGGTGTAGCGGTGAAATGCGT
AGATATCAGGAGGAACACCGGTGGTGTAGACGGCTTTCTGGACCGTAACT
GACGCTGAGACACGAAAGCGTGGGTAGCAAACAGGATTAGATACCCTGGT
AGTCCACGCCCTAAACGATGCATATTTGGTGTGGGCAGTTCATTCTGTCC
GTGCCGGAGCTAACGCGTTAAATATGCCGCCTGGGGAGTACAGTCGCAAG
GCTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGT
GGTTTAATTCGACGCAACGCGAAGAACCTTACCTGGGCTCGAACGGCTTC
CCAACGCCGGTAGAAATATCGGTACCCCGCAAGGGGGTGGAATCGAGGTG
CTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGC
AACGAGCGCAACCCTTGTCCTGTGTTGCCATGCCGCAAGGCGGCACTCGC
AGGAGACCGCCAGCGATAAGCTGGAGGAAGGTGGGGATGACGTCAAGTCC
TCATGGCCTTTATGTCCAGGGCTACACACGTGCTACAATGGCCGGTACAA
AGCGTCGCTAACCTGCGAAGGGGAGCCAATCGCAAAAAACCGGTCTCAGT
TCGGATTGCAGGCTGCAACCCGCCTGCATGAAGCTGGAATCGCTAGTAAT
GGCAGATCAGCACGCTGCCGTGAATACGTTCCCGGGCCTTGTACACACAT
/********************************
Based on code by:
Lorenzo Seidenari (sixmoney#virgilio.it)
*********************************/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#define MAX_SEQUENCE_LENGTH 100000
int n;
int m;
int levenshtein_distance(char *s,char*t);
int minimum(int a,int b,int c);
//-----------------------------------------------------------------------------
void cleanString(char string[]) {
//Removes all spaces from string pointed to by "string", converts characters
//to uppercase, and deletes a terminating newline character.
int i, current;
int length = strlen(string);
current = 0;
for(i=0;i<length;i++) {
if(string[i]=='\n') {
string[current++] = '\0';
break;
}
else if(string[i]!=' ') {
string[current++] = toupper(string[i]);
}
}
}
//-----------------------------------------------------------------------------
int importFASTA(char *filename, char *sequence) {
//Reads a file, located at path specified by "filename", containing a FASTA
//sequence. It finds the first full, complete sequence in the file, stores
//it in "sequence", and returns the length of the sequence, or -1 on failure.
FILE *fastaFile;
char input[256];
int readFlag; //set to 1 once a sequence has been read in
int length;
//open the file
if((fastaFile = fopen(filename, "r")) == NULL) {
return -1;
}
sequence[0] = '\0';
//read the full first sequence, discarding unnecessary headers
readFlag=0;
length = 0;
while(fgets(input,256,fastaFile)!=NULL) {
//is it a header or a comment?
if(input[0]=='>' || input[0]==';') {
if(readFlag) break;
else continue;
}
else readFlag = 1;
cleanString(input);
length += strlen(input);
strncat(sequence,input,MAX_SEQUENCE_LENGTH-length - 1);
}
//Add a terminatng null character, just in case
sequence[length] = '\0';
fclose(fastaFile);
return length;
}
/****************************************/
/*Implementation of Levenshtein distance*/
/****************************************/
__global__ void levenshtein_distance(char *s,char*t, int one, int two)
/*Compute levenshtein distance between s and t*/
{
//Step 1
int k,i,j,cost,*d;
int distance = 0;
if(one!=0&&two!=0)
{
d=(int *)malloc((sizeof(int))*(two+1)*(one+1));
two++;
one++;
//Step 2
for(k=0;k<one;k++){
d[k]=k;
}
for(k=0;k<two;k++){
d[k*one]=k;
}
//Step 3 and 4
for(i=1;i<one;i++){
for(j=1;j<two;j++)
{
//Step 5
printf("%d %d %d\n", one, i, j);
if(s[i-1]==t[j-1]){
cost=0;
printf("%d %d %d\n", one, i, j);
}
else{
cost=1;
printf("%d %d %d\n", one, i, j);
}
printf("%d %d %d\n", one, i, j);
//Step 6
int min = d[(j-1)*one+i]+1;
if (d[j*one+i-1]+1 < min)
min = d[j*one+i-1]+1;
if (d[(j-1)*one+i-1]+cost < min)
min = d[(j-1)*one+i-1]+cost;
d[j*one+i] = min;
}
distance=d[one*two-1];
free(d);
printf("%d\n", distance);
}
}
else
printf ("-1");
}
int main(int argc, char *argv[]) {
char A[MAX_SEQUENCE_LENGTH+1];
char B[MAX_SEQUENCE_LENGTH+1];
if(argc < 3) {
printf("Usage: new_edit_distance <sequence1> <sequence2>\n");
printf("<sequence1>: file containing the first sequence, FASTA format\n");
printf("<sequence2>: file containing the second sequence, FASTA format\n");
return EXIT_FAILURE;
}
n = importFASTA(argv[1],A);
m = importFASTA(argv[2],B);
levenshtein_distance<<<1, 1>>>(A,B, n, m);
cudaDeviceSynchronize();
printf ("%s\n", cudaGetErrorString(cudaGetLastError()));
return EXIT_SUCCESS;
}

I get it now. You took straight serial C/C++ code, dropped it into a kernel, intended to run that kernel as a single thread, and then want to proceed from there.
The idea is plausible, but you're missing a key fact about CUDA and GPUs: they can't directly access host memory.
So when you set up A and B like this:
char A[MAX_SEQUENCE_LENGTH+1];
char B[MAX_SEQUENCE_LENGTH+1];
....
n = importFASTA(argv[1],A);
m = importFASTA(argv[2],B);
those are ordinary variables that live in host memory. GPU (ordinary CUDA) code can't directly access host memory. So when you pass those pointers to a kernel like this:
levenshtein_distance<<<1, 1>>>(A,B, n, m);
the GPU code will try and dereference those A and B pointers and will fault (unspecified launch failure).
Every CUDA program has the following basic sequence:
copy data to the GPU
perform computations on the GPU
copy results back
You've tried to do step 2 without step 1. It won't work.
Since I'm not able to run your program since I don't have valid input files, I'll make the following suggestion. I assume you know little or nothing about CUDA. Try adding lines like this:
n = importFASTA(argv[1],A); // no change
m = importFASTA(argv[2],B); // no change
char *d_A, *d_B; // add this line
cudaMalloc(&d_A, MAX_SEQUENCE_LENGTH+1); // add this line
cudaMalloc(&d_B, MAX_SEQUENCE_LENGTH+1); // add this line
cudaMemcpy(d_A, A, MAX_SEQUENCE_LENGTH+1, cudaMemcpyHostToDevice); // add
cudaMemcpy(d_B, B, MAX_SEQUENCE_LENGTH+1, cudaMemcpyHostToDevice); // add
levenshtein_distance<<<1, 1>>>(d_A,d_B, n, m); //modify parameters
n and m don't need to be handled any differently since you are passing those by value.
And add proper cuda error checking to your code.
EDIT: after some further analysis, it's clear that this sequence is not correct:
distance=d[one*two-1];
free(d);
printf("%d\n", distance);
}
}
You are freeing d on every iteration of the i loop. That cannot possibly be correct. I suggest you go back to square one and get your serial code working first, in ordinary serial C code, before dropping it into a cuda kernel this way. If you move that free statement outside the i loop, then your kernel runs for a very very long time. Be advised that in-kernel printf is limited in the amount of output that can be easily generated.
I'm not going to debug your code any further for you. Get your serial code working first, then figure out a way to create a kernel without massive quantities of printout.
A final comment: I said above your approach is "plausible". That it means it could be made to work, i.e produce the same behavior as the same code executing on the host. It does not mean it will run fast. This is not how you get acceleration out of a GPU (running a single block of a single thread). I assume you already know this based on your comment "how to get this particular issue settled before starting the task of generating parallel code." But I think the disclaimer is appropriate anyway.

Related

Buffer mutex and condition variables in C

I only just started writing multithreading in C and don't have a full understanding of how to implement it. I'm writing a code that reads an input file and puts into a buffer struct array. When the buffer has no more available space, request_t is blocked waiting for available space. It is controlled by thread Lift_R. The other threads lift 1-3 operate lift() and it writes whats in buffer to the output file depending number of int sec. sec and size and given values through command line. This will free up space for request to continue reading the input.
Can someone please help me with how to implement these functions properly. I know there are other questions relating to this, but I want my code to meet specific conditions.
(NOTE: lift operates in FIFO and threads use mutual exclusion)
This is what I wrote so far, I haven't implemented any waiting conditions or FIFO yet, I'm currently focusing on the writing to file and debugging and am soon getting to wait and signal.
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include "list.h"
pthread_cond_t cond1 = PTHREAD_COND_INITIALIZER; //declare thread conditions
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; //declare mutex
int sec; //time required by each lift to serve a request
int size; //buffer size
buffer_t A[];
write_t write;
void *lift(void *vargp)
{
pthread_mutex_lock(&lock);
FILE* out;
out = fopen("sim_output.txt", "w");
//gather information to print
if (write.p == NULL) //only for when system begins
{
write.p = A[1].from;
}
write.rf = A[1].from;
write.rt = A[1].to;
write.m = (write.p - A[1].from) + (A[1].to - A[1].from);
if (write.total_r == NULL) //for when the system first begins
{
write.total_r = 0;
}
else
{
write.total_r++;
}
if (write.total_m == NULL)
{
write.total_m = write.m;
}
else
{
write.total_m = write.total_m + write.m;
}
write.c = A[1].to;
//Now write the information
fprintf(out, "Previous position: Floor %d\n", write.p);
fprintf(out, "Request: Floor %d to Floor %d\n", write.rf, write.rt);
fprintf(out, "Detail operations:\n");
fprintf(out, " Go from Floor %d to Floor %d\n", write.p, write.rf);
fprintf(out, " Go from Floor %d to Floor %d\n", write.rf, write.rt);
fprintf(out, " #movement for this request: %d\n", write.m);
fprintf(out, " #request: %d\n", write.total_r);
fprintf(out, " Total #movement: %d\n", write.total_m);
fprintf(out, "Current Position: Floor %d\n", write.c);
write.p = write.c; //for next statement
pthread_mutex_unlock(&lock);
return NULL;
}
void *request_t(void *vargp)
{
pthread_mutex_lock(&lock); //Now only request can operate
FILE* f;
FILE* f2;
f = fopen("sim_input.txt", "r");
if (f == NULL)
{
printf("input file empty\n");
exit(EXIT_FAILURE);
}
f2 = fopen("sim_output.txt", "w");
int i = 0;
for (i; i < size; i++)
{
//read the input line by line and into the buffer
fscanf(f, "%d %d", &A[i].from, &A[i].to);\
//Print buffer information to sim_output
fprintf(f2, "----------------------------\n");
fprintf(f2, "New Lift Request from Floor %d to Floor %d \n", A[i].from, A[i].to);
fprintf(f2, "Request No %d \n", i);
fprintf(f2, "----------------------------\n");
}
printf("Buffer is full");
fclose(f);
fclose(f2);
pthread_mutex_unlock(&lock);
return NULL;
}
void main(int argc, char *argv[]) // to avoid segmentation fault
{
size = atoi(argv[0]);
if (!(size >= 1))
{
printf("buffer size too small\n");
exit(0);
}
else
{
A[size].from = NULL;
A[size].to = NULL;
}
sec = atoi(argv[1]);
pthread_t Lift_R, lift_1, lift_2, lift_3;
pthread_create(&Lift_R, NULL, request_t, NULL);
pthread_join(Lift_R, NULL);
pthread_create(&lift_1, NULL, lift, NULL);
pthread_join(lift_1, NULL);
pthread_create(&lift_2, NULL, lift, NULL);
pthread_join(lift_2, NULL);
pthread_create(&lift_3, NULL, lift, NULL);
pthread_join(lift_3, NULL);
}
And here is the struct files:
#include <stdbool.h>
typedef struct Buffer
{
int from;
int to;
}buffer_t; //buffer arrary to store from and to values from sim_input
typedef struct Output
{
int l; //lift number
int p; //previous floor
int rf; //request from
int rt; //request to
int total_m; //total movement
int c; // current position
int m; //movement
int total_r; //total requests made
}write_t;
Between reading your code and question I see a large conceptual gap. There are some technical problems in the code (eg. you never fclose out); and a hard to follow sequence.
So, this pattern:
pthread_create(&x, ?, func, arg);
pthread_join(x, ...);
Can be replaced with:
func(arg);
so, your really aren't multithreaded at all; it is exactly as if:
void main(int argc, char *argv[]) // to avoid segmentation fault
{
size = atoi(argv[0]);
if (!(size >= 1))
{
printf("buffer size too small\n");
exit(0);
}
else
{
A[size].from = NULL;
A[size].to = NULL;
}
sec = atoi(argv[1]);
request_t(0);
lift(0);
lift(0);
lift(0);
}
and, knowing that, I hope you can see the futility in:
pthread_mutex_lock(&lock);
....
pthread_mutex_unlock(&lock);
So, start with a bit of a rethink of what you are doing. It sounds like you have a lift device which needs to take inbound requests, perhaps sort them, then process them. Likely 'forever'.
This probably means a sorted queue; however one not sorted by an ordinary criteria. The lift traverses the building in both directions, but means to minimize changes in direction. This involves traversing the queue with both an order ( >, < ) and a current-direction.
You would likely want request to simply evaluate the lift graph, and determine where to insert the new request.
The lift graph would be a unidirectional list of where the lift goes next. And, perhaps a rule that the list only consults its list as it stops at a given floor.
So, the Request can take a lock of the graph, alter it to reflect the new requestion, then unlock it.
The Lift can simply:
while (!Lift_is_decommissioned) {
pthread_mutex_lock(&glock);
Destination = RemoveHead(&graph);
pthread_mutex_unlock(&glock);
GoTo(Destination);
}
And the Request can be:
pthread_mutex_lock(&glock);
NewDestination = NewEvent.floorpressed;
NewDirection = NewEvent.floorpressed > NewEvent.curfloor ? Up : Down;
i = FindInsertion(&graph, NewDestination, NewDirection);
InsertAt(&graph, i, NewDestination);
pthread_mutex_unlock(&glock);
Which may be a bit surprising that there is no difference between pressing a "goto floor" button from within the lift, and a "I want lift here now" from outside the lift.
But, with this sort of separation, you can have the lift simply follow the recipe above, and the handlers for the buttons invoke the other pseudo code above.
The FindInsertion() may be a bit hairy though....

Segmentation Fault with Merge Sort

I've recently begun brushing up on my C again after having not touched it for some time. So, I started by trying to implement merge sort but after looking at these lines for about an hour, I've failed to determine what is wrong.
int main(int argc,char** argv){
printf("test\n");
int x;
int input[] = {1,2,3};
int start=0;
int end=2;
for (x=0;x<5;x++){
//printf("%i\n",x);
}
printf("Run mergeSort in main \n");
mergeSort(input,start,end);
}
void mergeSort(int input[], int start, int end){
printf("Running merge %i %i",start,end);
int middle = (start + end)/2;
if (start < end){
mergeSort(input,start,middle);
mergeSort(input,middle+1,end);
merge(input,start,middle,end);
}
}
When I run this program, the first line "test" will print then "Run merge sort in main" but
printf("Running merge %i %i",start,end);
does not get executed which leaves me confused. I can't find any memory allocation issues or maybe I'm missing something very obvious.
Note that the merge function has been implemented on the side, but that portion of the code was not relevant to this problem directly so I did not include it.
EDIT Remaining Code:
void merge(int input[],int start, int middle, int end){
int save_start = start;
int temp[end-start+1];
int index = 0;
printf("Start merge");
while (start<=middle || middle+1<=end){
if (input[start]<input[middle]){
temp[index] = input[start];
start++;
} else {
temp[index] = input[middle];
middle++;
}
index++;
}
while (start<=middle){
temp[index] = input[start];
start++;
}
while (middle+1<=end){
temp[index] = input[middle];
middle++;
}
int i=0;
int a;
for (a=save_start;i<index;a++,i++){
input[a]=temp[i];
}
}
Most likely your output is buffered and is just waiting around to get printed. Add a \n to that string or call fflush(stdout).
As far as your segmentation fault is concerned, you haven't shown us enough code. Use a debugger to get a backtrace and you may find out some more information.
Edit: You have some array index bugs in your merge function. Notice that you're moving middle during the first loop there - what does that do to the loop condition? Hint: you need to stop at the actual middle position, not keep going once start gets past that point.
Edit 2: You have some off-by-one errors in there too.
Edit 3: Your || should be &&.
I believe your mistake is here: while (start<=middle || middle+1<=end).
start<=middle || middle+1<=end is true when one or more of these is true: start<=middle or middle+1<=end, causing your code to read and write out of bounds of your arrays in that loop.
start<=middle && middle+1<=end is true when all of these are true: start<=middle and middle+1<=end, causing your loop to break when it might otherwise read and write out of bounds of your array.
You probably mean while (start<=middle && middle+1<=end).

Embedding asm into C programming

I am trying to learn assembly my self, and I have been reading different websites first to know the meaning of some registers, if-the, etc, and saw examples on how to use them.
However I don't find it easy to understand. This program finds certain letters and counts them in a board using a bidimensional array. I want to replace the part of the functions void print_results(), and void count() with assembly code since this is very easy in regular C code.
I am not sure how to start so I am more interested on just a good start, specially on how to pass the variable from void read_board() to the function void count() to count the letters found, after that I think I can be on my own.
I appreciate any help, Thank you.
#include <stdio.h>
FILE *inputFilePtr;
char board[7][7];
void usage() {
printf("usage: one filename argument.\n");
}
void read_board() {
int i, j;
for (i=0; i != 7; i++) {
for (j=0; j != 7; j++) {
fscanf(inputFilePtr, "%c", &board[i][j]);
}
fscanf(inputFilePtr, "\n");
}
}
void count() {
__asm__("\
");
}
void print_results() {
}
int main(int argc, char**argv) {
if (argc != 2) {
usage();
return 1;
}
inputFilePtr = fopen(argv[1], "r");
if (inputFilePtr == NULL) {
printf("Couldn't open file, %s\n", argv[1]);
return 1;
}
read_board();
count();
print_results();
return 0;
}
If I get it right, you ask for calling conventions. Maybe this helps: http://en.wikipedia.org/wiki/Calling_convention
I am assuming you want the C-equivalent that does your current asm part.
Since you array board is global and your board size is fixed (7x7), you don't need pass anything to count(). This will do:
void count() {
int i, j, count = 0;
char = 'X'; //Replace with whatever char you want to count
for (i=0; i != 7; i++) {
for (j=0; j != 7; j++) {
if(board[i][j] == c)
count++;
}
}
}
Then simply call count() from wherever you want.
In case if you want to know how to pass parameters to functions (if the board, i & j are not global like your case):
call count as: count(&board[0][0], int i, int j);
Receive the parameters as: void count(char **board, int i, int j)
Not a direct (source) solution in asm, but here are some links you might, or might not have read:
GAS, x86?
X86 Assembly/GAS Syntax, this page has a very easy to read instruction by instruction. Click the X86_Assembly link at top to get the whole "book".
Writing Assembly-Language Functions ... GCC; Has a Examples section with some C-code <-> asm lists.
User guide to the gnu assembler as version 2.17
Using Assembly Language in Linux (http://asm.sourceforge.net), has some info on intel vs AT&T syntax. (as do wiki book above). Also look at the Syscalls with > 5 args. down a bit on the site.

Fastest way to print a certain number of characters to stdout in C

I have to print a certain number of blank spaces to stdout, but this number is not fixed. I'm using putchar(), but I'm not sure if this is fast. What is the fastest way to print a certain number of characters to stdout in C? Also, I cannot use system functions.
Thanks for you help!
I would just use fwrite. Simple. Correct. Easy.
void put_spaces(int n)
{
static const char SPACES[32] = " ";
for (; n >= 32; n -= 32)
fwrite(SPACES, 32, 1, stdout);
if (n)
fwrite(SPACES, n, 1, stdout);
}
Note, however, that the naive version is also quite fast:
void put_spaces(int n)
{
while (n--)
putchar(' ');
}
Why is it fast? On most systems, putchar is a macro which writes directly into a buffer most of the time. If you're not sure it's fast, the correct answer is profile your application, not "optimize first".
Stay away from malloc (it's just unnecessary), puts (which adds a '\n' every time you call it), and printf (it's too complicated for such a simple task).
I would try to use the system commands instead of making my own.
something like:
void print_spaces(unsigned int number_of_spaces) {
char* spaces = malloc(sizeof(char)*number_of_spaces + 1);
memset (spaces,' ',number_of_spaces);
spaces[number_of_spaces] = '\0';
printf("%s",spaces);
free(spaces);
}
would do the trick.
printf() allows you to adjust the number of spaces to be print, but this has to be stated in the format string. Se here for reference.
char format[256];
sprintf(format, "%%%ds", number_of_spaces); // creates the format string
printf(format, " ");
I'm assuming by "system functions", you mean non-standard extensions. In which case, it all depends on whether you mean fastest to write or fastest to execute?
If the former, and assuming there's an upper limit, you can just use something like:
void outSpaces (unsigned int num) {
static char *lotsaSpaces = " ";
printf ("%*.*s", num, num, lotsaSpaces);
}
If the latter, something like this should be a good starting point:
void outSpaces (unsigned int num) {
static char *hundredSpaces = "<<insert 100 spaces here>>";
while (num >= 100) {
puts (hundredSpaces);
num -= 100;
}
printf ("%*.*s", num, num, hundredSpaces);
}
You need to be aware that function calls can be expensive, even with output buffering. In that case, it may be best to call puts once to output a hundred characters rather than call putchar a hundred times.
Perhaps:
void PrintSpaces (int num_spaces)
{
char *spaces = " "; /* twenty spaces */
while (num_spaces > 20)
{
puts (spaces);
num_spaces -= 20;
}
if (num_spaces)
{
puts (&spaces [19 - num_spaces]);
}
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
int main() {
const size_t size = 5;
char* const data = (char*)malloc(size * sizeof(char) + 1);
if (!data) {
return EXIT_FAILURE;
}
memset(data, ' ', size);
data[size] = '\0'; // not needed (in this case)
fwrite(data, sizeof(char), size, stdout);
free(data);
return EXIT_SUCCESS;
}
(If the number of spaces isn't outrageous)
I don't known c, but here is the basic idea.
create an array of size 8192, and completely fill that particular array with spaces, now you can use puts or write system call or use something which is efficient, and then print this array.
Here I have a code snippet in go, but if you prefer c, you can see an example of how you do it, its actually GNU's yes program which is freaking fast at printing things, there is followed up explanation over there.
package main
import (
"bufio"
"os"
)
func main() {
t := []byte{'y', '\n'}
var used int
const tot = 8192
buf := make([]byte, 0, tot)
for used < tot {
buf = append(buf, t...)
used += 2
}
//Filled complete array named as buf with "y\n"
w := bufio.NewWriter(os.Stdout)
for {
w.Write(buf) //using write system call to print.
}
w.Flush()
}
//syscall.Write({without buf}) : 1.40MiB/s
//syscall.Write(buf) : 1.5GiB/s

Multithreading going throught an array of struct in C

I have wrote a program that receives as input a text file and return as output another text file.
The text file is created with a script(python) inside a 3D app (Blender) , and it contains a list of vertex that are part of a square mesh. The program receives that data, stores it in a struct, and return a list of vertex that forms a smaller square. Than, the 3D app, again with a script, reads this vertices and separate them from the original mesh. Doing this several times, the original mesh will be divided in many squares of the same area.
BY NOW, IT WORKS ;)
But is terribly low.. When doing it on 200k vertices it takes a while, but running it on 1kk vertices it takes ages
Here the source:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
typedef struct{
int index;
float x,y,z;
} vertex;
vertex *find_vertex(vertex *list, int len)
{
int i;
vertex lower,highter;
lower=list[0];
highter=list[1];
//find the lower lefter and the upper righter vertices
for(i=0;i<len;i++)
{
if ((list[i].x<=lower.x) && (list[i].y<=lower.y))
lower=list[i];
if ((list[i].x>=highter.x) && (list[i].y>=highter.y))
highter=list[i];
}
vertex *ret;//create a pointer for returning 2 structs
ret=(vertex*)malloc(sizeof(vertex)*2);
if (ret==NULL)
{
printf("Can't allocate the memory");
return 0;
}
ret[0]=lower;
ret[1]=highter;
return ret;
}
vertex *square_list_of_vertex(vertex *list,int len,vertex start, float size)
{
int i=0,a=0;
unsigned int *num;
num=(int*)malloc(sizeof(unsigned int)*len);
if (num==NULL)
{
printf("Can't allocate the memory");
return 0;
}
//controlls if point is in the right position and adds its index in the main list in another array
for(i=0;i<len;i++)
{
if ((list[i].x-start.x)<size && (list[i].y-start.y<size))
{
if (list[i].y-start.y>-size/100)//it was adding also wrong vertices. This line is to solve a bug
{
num[a]=i;
a++;//len of the return list
}
}
}
//create the list with the right vertices
vertex *retlist;
retlist=(vertex*)malloc(sizeof(vertex)*(a+1));
if (retlist==NULL)
{
printf("Can't allocate the memory");
return 0;
}
//the first index is used only as an info container
vertex infos;
infos.index=a+1;
retlist[0]=infos;
//set the value for the return pointer
for(i=1;i<=a;i++)
{
retlist[i]=list[num[i-1]];
}
return retlist;
}
//the function that pass the data to python
void return_funct_1(vertex lower,vertex highter)
{
FILE* ret;
ret=fopen("max_min.txt","w");
if (ret==NULL)
{
printf("Error opening the file\n");
return;
}
fprintf(ret,"%i\n",lower.index);
fprintf(ret,"%i\n",highter.index);
fclose(ret);
}
//the function that pass the data to python
void return_funct_2(vertex *squarelist)
{
FILE* ret;
int i,len;
ret=fopen("square_list.txt","w");
if (ret==NULL)
{
printf("Error opening the file\n");
return;
}
len=squarelist[0].index;
for(i=1;i<len;i++)
{
//return all the informations
//fprintf(ret,"%i %f %f %f\n",squarelist[i].index,squarelist[i].x,squarelist[i].y,squarelist[i].z);
//just return the index(it's enought for the python script)
fprintf(ret,"%i\n",squarelist[i].index);
}
fclose(ret);
}
//argv:
//function[1/2] number_of_vert(int) size_of_square(int) v_index(int) v_xcoord(float) v_ycoord(float) v_zcoord(float)...
//example of file: 2 4 2 0 1 2 3 1 1 2 3 2 1 2 3 3 1 2 3 4 1 2 3 //function 1, number of ver=4, size=2 and then the 4 vertex with their coords
int main(int argc, char *argv[])
{
if(argc==1)
{
printf("%s need a path to a vectorlist file\n",argv[0]);
return 0;
}
FILE* input;
input=fopen(argv[1],"r");
if (input==NULL)
{
printf("Error opening the file\n");
return(0);
}
int func=0,i=0,a=0,u=0;
char read;
char* argument;
argument=(char*)malloc(sizeof(char)*50);//yeah, i know, i should use list instead of an array, but when i first coded this i was quite in hurry (and i'm still learning )
//get the first paramater in the file
argument[0]=fgetc(input);
argument[1]='\0';
func=atoi(argument);
//skipp the space
read=fgetc(input);
//get the number of vertices;
i=0;
do {
read=fgetc(input);
argument[i]=read;
i++;
}while(read!=' ' && !feof(input) );
//set the end of the string
argument[i]='\0';
//set the variable to the correct integer value;
int vnumber=atoi(argument);
i=0;
do {
read=fgetc(input);
argument[i]=read;
i++;
} while(read!=' ' && !feof(input));
//set the end of the string
argument[i]='\0';
float sqsize=atof(argument);
vertex *list;
//allocate memory in the array to fit the number of vertex needed
list=(vertex*)malloc(sizeof(vertex)*vnumber);
//control if the memory get allocated
if (list==NULL)
{
printf("Can't allocate the memory");
return 0;
}
//do the cycle for each vertex
for(u=0;u<vnumber;u++)
{
//read the number and assign it to the proper value of the vertex
for(a=0;a<4;a++)
{
i=0;
do
{
read=fgetc(input);
argument[i]=read;
i++;
} while(read!=' ' && !feof(input));
argument[i]='\0';
if(a==0)
list[u].index=atoi(argument);
if(a==1)
list[u].x=atof(argument);
if(a==2)
list[u].y=atof(argument);
if(a==3)
list[u].z=atof(argument);
}
}
//close the file
fclose(input);
if (func==1)
{
//find the lowest vertex and the higtest vertex
vertex lower;
vertex highter;
vertex *lohi;
lohi=(vertex*)find_vertex(list, vnumber);
lower=lohi[0];
highter=lohi[1];
free(lohi);
return_funct_1(lower,highter);//the function that return the data to python
return 1;
}
if(func==2)
{
//find the list to return
vertex *lohi;
lohi=(vertex*)find_vertex(list, vnumber);
vertex lower;
lower=lohi[0];
free(lohi);
return_funct_2(square_list_of_vertex(list,vnumber, lower, sqsize));//the function that return the data to python
return 1;
}
printf("Function argument was wrong: nothing was done\n");
}
I would really appreciate any help for making this multithreaded.. It takes ages to work on really big data(today i've tried with a 50mb text file, and after 20 mins it had run only 30 times(on the 26000 i needed)), and since quite all pc that will use this will have at least 4 cores, i would really like to get it multithreaded!
Thanks in advice! :)
Ps: if you need, i can post the python script code too, but it's quite full of calls to the internal api of the program, so i don't really know if it would be usefull.
I am not going to work specifically through your code but your algorithm may be able to apply Map and Reduce.
This is an article of how you can use it in C:
http://pages.cs.wisc.edu/~gibson/mapReduceTutorial.html
When I profile your code running over a sample dataset of 2 million random vertexes, with the source file preloaded into the page cache the bottleneck is the conversion of strings to floats (it still runs in only 5 seconds, though - so it's not that slow).
It is possible to multithread the conversion of strings to floats, and with careful coding you will get some gains this way. However, you will get much more bang for your buck if instead you change the Python code to write the floating point numbers in a binary format that can be directly loaded by the C code (with fread()). I believe you can use struct.pack on the Python side to achieve this.
The processing part of your code can certainly be improved too, but until it is the bottleneck I wouldn't worry about it.

Resources