I'm trying to make a Quick Sort function using ARM assembly (Raspberry pi),
but it shows me segmentation error.
I think recursion process makes that error, while storing or loading with stacks.
Can you tell me how can I fix it?
I used ARM assembly code in https://en.wikibooks.org/wiki/Algorithm_Implementation/Sorting/Quicksort#ARM_Assembly
here,
I just typed it same. Just changing registers like 'r3'->'r2', 'r2'->'r1', 'r1'->'r0' ...
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SIZE 32
int main()
{
int arr[SIZE];
int max, min;
int i;
for (i = 0; i < SIZE; i++) {
arr[i] = rand() % 100;
}
asm(
"mov r0, #0\n\t"
"mov r1, #128\n\t"
"Loop3:\n\t"
"stmfd sp!, {r3, r5, lr}\n\t"
"mov r5, r1\n\t"
"Loop4:\n\t"
"sub r6, r5, r0\n\t"
"cmp r6, #4\n\t"
"ldmlefd sp!, {r3, r5, pc}\n\t"
"ldr r6, [%[arr],r0]\n\t"
"add r1, r0, #4\n\t"
"mov r3, r5\n\t"
"Loop5:\n\t"
"ldr r2, [%[arr],r1]\n\t"
"cmp r2, r6\n\t"
"addle r1, r1, #4\n\t"
"ble Loop6\n\t"
"sub r3, r3, #4\n\t"
"ldr r4, [%[arr],r3]\n\t"
"str r4, [%[arr],r1]\n\t"
"str r2, [%[arr],r3]\n\t"
"Loop6:\n\t"
"cmp r1, r3\n\t"
"blt Loop5\n\t"
"Loop7:\n\t"
"sub r1, r1, #4\n\t"
"ldr r2, [%[arr],r1]\n\t"
"str r2, [%[arr],r0]\n\t"
"str r6, [%[arr],r1]\n\t"
"bl Loop3\n\t"
"mov r0, r3\n\t"
"b Loop4\n\t"
:
:
[arr] "r"(arr)
:
"r0", "r1", "r2", "r3", "r4", "r5", "r6"
);
return 0;
}
You inline asm can never reach the end of the asm template. Presumably you're trying to return out of the C function, not just the internal recursive calls. That's obviously unsafe because there's zero guarantee about stack layout or the contents of LR, and that will change with/without optimization.
Don't write a whole recursive in the middle of a C function.
Use a debugger to single-step the resulting program and see where your code breaks the compiler-generated asm that surrounds it.
Also your inline asm is broken: you dereference arr without specifying it as a memory read/write input or a "memory" clobber. A pointer input does not imply that the pointed-to memory is also an operand.
Related
I'm working on STM32CubeIDE.
To mix C and arm assembly, we initially used EXPORT.
Below is main.c:
#include <stdio.h>
extern int calc(int num, int* cnt);
int main()
{
int cnt = 5;
calc(4, &cnt);
return 0;
}
And then calc.s:
AREA calculator, CODE
EXPORT calc
ALIGN
calc:PROC
PUSH {r4,r5,lr}
MOV r5, #13
UDIV r4, r0, r5
MUL r5, r4, r5
SUB r4, r0, r5
CMP r4,#0
BEQ ace
CMP r4,#8
BGT jqk
then: ADD r4, r4, #1
B exit
ace:ADD [r1],#1
MOV r4, #11
B exit
jqk:MOV r4, #10
exit:MOV r0, r4
POP {r4, r5, pc}
ENDP
I put the two files in the same place and built main.c, but I get an error that says it's an unknown external reference.
So after giving up, I tried to put the ASM sentence in the c file using inline assembly.
calc PROC
PUSH {r4, lr}
AND r4, r0, #12
CMP r4, #0
BEQ ace
CMP r4, #8
BGT jqk
then ADD r4, r4, #1
B exit
ace ADD r1, r1, #1
MOV r4, #11
B exit
jqk MOV r4, #10
exit MOV r0, r4
ENDP
I've written these assembly codes, and I've adapted them to inline grammar.
reference : Labels in GCC inline assembly
int calc(int num, int *cnt)
{
int rst=0;
int c = *cnt;
__asm volatile("MOV R0, %0": :"r"(num));
__asm volatile("AND R0, R0, %0": :"r"(0x12));
__asm volatile("MOV R1, %0": :"r"(c));
__asm volatile("CMP R0, #0");
__asm volatile("BEQ ace%=");
__asm volatile("CMP R0,#8");
__asm volatile("BGT jqk%=");
__asm volatile("ADD R2, R0, #1");
__asm volatile("B exit%=");
__asm volatile("ace%=: ADD R1, R1, #1");
__asm volatile("MOV R2, #11");
__asm volatile("B exit%=");
__asm volatile("jqk%=: MOV R2, #10");
__asm volatile("exit%=: LDR %0, R2":"=r"(rst):);
__asm volatile("LDR %0, R1":"=r"(c):);
__asm volatile("POP {R0, R1, R2}");
*cnt = c;
return rst;
}
But even in this case, an error appears.
What should I change in my code?
'''
int calc(int num, int *cnt)
{
int tmp = num % 13;
if (tmp == 0)
{
tmp = *cnt;
*cnt = tmp+1;
return 11;
}
else if (tmp > 8)
return 10;
else
return tmp + 1;
}
'''
You actually did the right thing initially, putting the asm code in a separate .s file and not using inline asm at all. The only thing is that you need to explicitly compile and link the calc.s file along with main.c
cc -o program main.c calc.s
should compile and assemble both files and link them. If you're using an IDE, you need to specify both main.c and calc.s as source files of the project.
I'm student and taking Microprocessor class.
Because of COVID-19, I had gotten a simple mid-term assignment writing inline assembly code of QuickSort.
Today, I got a score of this assignment and the score was almost the lowest because of "operation speed". (scored by rank, that mean my code works much slower than anyone else in class)
I tried to optimize my code before submission. So I have no idea why does my code work slowly.
TA said it is common for 85 ms to come out, but my code has an execution speed of 400ms.
I guess my code load a lot of unnecessary memory.
So my questions are:
Do I load unnecessary memory? (ie. Is my code use ldr, str instruction in unnecessary way?)
Does the number of memory loads affect the actual operation time?
Adding comment on "optimization", We only learned that the access speed of memory is slower then the register in the class, So I only tried to minimize the memory load and reduce unnecessary construction.
That mean someone can use "awesome tricks" to solve this problem but it is not general in my class.
Development environment:
GNU C Compiller(GCC) in Arm cortex M0 processor and nano Editor
Here is my code:
void QuickSort(int* arr, int size, int pivot, int end) {
int Lsize, Lstart, Lend, Rsize, Rstart, Rend;
//Note that all variables are mean index of Array, excep for temp
//so in inline assembly, We have to change them into byte
asm
(
/*check Condition*/
"MOV r10, #2\n\t"
"CMP r1, r10\n\t"
"BLT FINISH\n\t"
/*Initialize*/
"MOV r10, #4\n\t" //r10 is temp value
//"LDR r1, %[size]\n\t" //r1 is size of arr
//"LDR r2, %[pivot]\n\t" //r2 is pivot's address(start)
"MUL r2, r2, r10\n\t"
//"LDR r3, %[end]\n\t" //r3 is end's address
"MUL r3, r3, r10\n\t"
"MOV r4, r2\n\t" //r4 is low's address
"ADD r4, r4, #4\n\t" //low = pivot +1
"MOV r5, r3\n\t" //r5 is high's address
/*Initialize for loop*/
"LDR r6, [r0, r4]\n\t" //let r6 as value of low
"LDR r7, [r0, r2]\n\t" //let r7 as value of pivot
"LDR r8, [r0, r5]\n\t" //let r8 as value of high
/*Start Loop*/
"B L6\n\t" //Check Condition First
"LOOP2:\n\t" //while(low <= high)
"B L7\n\t" //check condition First
"LOOP3:\n\t" //while(arr[low]<=arr[pivot])
"ADDS r4, r4, #4\n\t" //low++
"LDR r6, [r0,r4]\n\t" //and update low's value
"L7: CMP r6, r7\n\t"
"BLE LOOP3\n\t"
"B L8\n\t" //check condition First
"LOOP4:\n\t" //while(arr[high]>=arr[pivot])
"SUBS r5, r5, #4\n\t" //high--
"LDR r8, [r0,r5]\n\t" //and update high's value
"L8: CMP r8, r7\n\t"
"BGE LOOP4\n\t"
"CMP r5, r2\n\t" //if(high<pivot)
"BGE L9\n\t"
"MOVS r5, r2\n\t" //high = pivot
"L9:\n\t"
"CMP r4, r5\n\t" //if(low >= high)
"BGE END\n\t" //break LOOP2
"LDR r9, [r0, r4]\n\t" //store r9 value of low
"LDR r10, [r0, r5]\n\t" //store r10 value of high
"STR r9, [r0, r5]\n\t"
"STR r10, [r0, r4]\n\t"
"MOV r6, r10\n\t"
"MOV r8, r9\n\t"
"L6: CMP r4, r5\n\t" //Compare whether low <= high
"BLE LOOP2\n\t" //If so, back to loop2
"END: \n\t" //This is end of loop2
/*swap pivot and high*/
"LDR r9, [r0, r5]\n\t" //store r9 vlaue of high
"LDR r10, [r0, r2]\n\t" //store r10 value of pivot
"STR r9, [r0, r2]\n\t"
"STR r10, [r0,r5]\n\t"
/*Set variables again*/
//Note that we have to divide them in 4
//Use r9 as quotient and r10 as zero
"MOV r9, #0\n\t"
"MOV r10, #0\n\t"
"B D1\n\t"
"sLoop1:\n\t"
"ADD r9, r9, #1\n\t"
"SUB r2, r2, #4\n\t"
"D1: CMP r2, r10\n\t"
"BGT sLoop1\n\t"
"MOV r2, r9\n\t" //pivot
"MOV r9, #0\n\t"
"B D2\n\t"
"sLoop2:\n\t"
"ADD r9, r9, #1\n\t"
"SUB r3, r3, #4\n\t"
"D2: CMP r3, r10\n\t"
"BGT sLoop2\n\t"
"MOV r3, r9\n\t" //end
"MOV r9, #0\n\t"
"B D4\n\t"
"sLoop4:\n\t"
"ADD r9, r9, #1\n\t"
"SUB r5, r5, #4\n\t"
"D4: CMP r5, r10\n\t"
"BGT sLoop4\n\t"
"MOV r5, r9\n\t" //high
/*Let r10 a temp of argument*/
"SUB r10, r5, r2\n\t" //Lsize = high-pivot
"STR r10, %[Lsize]\n\t"
"STR r2, %[Lstart]\n\t" //Lstart = pivot
"SUB r10, r5, #1\n\t" //Lend=high-1
"STR r10, %[Lend]\n\t"
"SUB r10, r3, r5\n\t" //Rsize = end-high
"STR r10, %[Rsize]\n\t"
"ADD r10, r5, #1\n\t" //Rstart=high+1
"STR r10, %[Rstart]\n\t"
"STR r3, %[Rend]\n\t" //Rend = end
://There is no output operands
:[arr] "r"(arr), [size] "m"(size) ,[pivot] "m"(pivot),[end] "m"(end), [Lsize] "m"(Lsize), [Lstart] "m"(Lstart), [Lend] "m"(Lend), [Rsize] "m"(Rsize), [Rstart] "m"(Rstart), [Rend] "m"(Rend)
:"r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
);
I am trying to implement a function which multiplies 32-bit operand with 256-bit operand in ARM assembly on ARM Cortex-a8. The problem is I am running out of registers and I have no idea how I can reduce the number of used registers here. Here is my function:
typedef struct UN_256fe{
uint32_t uint32[8];
}UN_256fe;
typedef struct UN_288bite{
uint32_t uint32[9];
}UN_288bite;
void multiply32x256(uint32_t A, UN_256fe* B, UN_288bite* res){
asm (
"umull r3, r4, %9, %10;\n\t"
"mov %0, r3; \n\t"/*res->uint32[0] = r3*/
"umull r3, r5, %9, %11;\n\t"
"adds r6, r3, r4; \n\t"/*res->uint32[1] = r3 + r4*/
"mov %1, r6; \n\t"
"umull r3, r4, %9, %12;\n\t"
"adcs r6, r5, r3; \n\t"
"mov %2, r6; \n\t"/*res->uint32[2] = r6*/
"umull r3, r5, %9, %13;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %3, r6; \n\t"/*res->uint32[3] = r6*/
"umull r3, r4, %9, %14;\n\t"
"adcs r6, r3, r5; \n\t"
"mov %4, r6; \n\t"/*res->uint32[4] = r6*/
"umull r3, r5, %9, %15;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %5, r6; \n\t"/*res->uint32[5] = r6*/
"umull r3, r4, %9, %16;\n\t"
"adcs r6, r3, r5; \n\t"
"mov %6, r6; \n\t"/*res->uint32[6] = r6*/
"umull r3, r5, %9, %17;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %7, r6; \n\t"/*res->uint32[7] = r6*/
"adc r6, r5, #0 ; \n\t"
"mov %8, r6; \n\t"/*res->uint32[8] = r6*/
: "=r"(res->uint32[8]), "=r"(res->uint32[7]), "=r"(res->uint32[6]), "=r"(res->uint32[5]), "=r"(res->uint32[4]),
"=r"(res->uint32[3]), "=r"(res->uint32[2]), "=r"(res->uint32[1]), "=r"(res->uint32[0])
: "r"(A), "r"(B->uint32[7]), "r"(B->uint32[6]), "r"(B->uint32[5]),
"r"(B->uint32[4]), "r"(B->uint32[3]), "r"(B->uint32[2]), "r"(B->uint32[1]), "r"(B->uint32[0]), "r"(temp)
: "r3", "r4", "r5", "r6", "cc", "memory");
}
EDIT-1: I updated my clobber list based on the first comment, but I still get the same error
A simple solution is to break this up and don't use 'clobber'. Declare the variables as 'tmp1', etc. Try not to use any mov statements; let the compiler do this if it has to. The compiler will use an algorithm to figure out the best 'flow' of information. If you use 'clobber', it can not reuse registers. They way it is now, you make it load all the memory first before the assembler executes. This is bad as you want memory/CPU ALU to pipeline.
void multiply32x256(uint32_t A, UN_256fe* B, UN_288bite* res)
{
uint32_t mulhi1, mullo1;
uint32_t mulhi2, mullo2;
uint32_t tmp;
asm("umull %0, %1, %2, %3;\n\t"
: "=r" (mullo1), "=r" (mulhi1)
: "r"(A), "r"(B->uint32[7])
);
res->uint32[8] = mullo1; /* was 'mov %0, r3; */
volatile asm("umull %0, %1, %3, %4;\n\t"
"adds %2, %5, %6; \n\t"/*res->uint32[1] = r3 + r4*/
: "=r" (mullo2), "=r" (mulhi2), "=r" (tmp)
: "r"(A), "r"(B->uint32[6]), "r" (mullo1), "r"(mulhi1)
: "cc"
);
res->uint32[7] = tmp; /* was 'mov %1, r6; */
/* ... etc */
}
The whole purpose of the 'gcc inline assembler' is not to code assembler directly in a 'C' file. It is to use the register allocation logic of the compiler AND do something that can not be easily done in 'C'. The use of carry logic in your case.
By not making it one huge 'asm' clause, the compiler can schedule the loads from memory as it needs new registers. It will also pipeline your 'UMULL' ALU activity with the load/store unit.
You should only use clobber if an instruction implicitly clobbers a specific register. You may also use something like,
register int *p1 asm ("r0");
and use that as an output. However, I don't know of any ARM instructions like this besides those that might alter the stack and your code doesn't use these and the carry of course.
GCC knows that memory changes if it is listed as an input/output, so you don't need a memory clobber. In fact it is detrimental as the memory clobber is a compiler memory barrier and this will cause memory to be written when the compiler might be able to schedule that for latter.
The moral is use gcc inline assembler to work with the compiler. If you code in assembler and you have huge routines, the register use can become complex and confusing. Typical assembler coders will keep only one thing in a register per routine, but that is not always the best use of registers. The compiler will shuffle the data around in a fairly smart way that is difficult to beat (and not very satisfying to hand code IMO) when the code size gets larger.
You might want to look at the GMP library which has lots of ways to efficiently tackle some of the same issues it looks like your code has.
I am trying to implement a function which multiplies 32-bit operand with 256-bit operand in ARM assembly on ARM Cortex-a8. The problem is I am running out of registers and I have no idea how I can reduce the number of used registers here. Here is my function:
typedef struct UN_256fe{
uint32_t uint32[8];
}UN_256fe;
typedef struct UN_288bite{
uint32_t uint32[9];
}UN_288bite;
void multiply32x256(uint32_t A, UN_256fe* B, UN_288bite* res){
asm (
"umull r3, r4, %9, %10;\n\t"
"mov %0, r3; \n\t"/*res->uint32[0] = r3*/
"umull r3, r5, %9, %11;\n\t"
"adds r6, r3, r4; \n\t"/*res->uint32[1] = r3 + r4*/
"mov %1, r6; \n\t"
"umull r3, r4, %9, %12;\n\t"
"adcs r6, r5, r3; \n\t"
"mov %2, r6; \n\t"/*res->uint32[2] = r6*/
"umull r3, r5, %9, %13;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %3, r6; \n\t"/*res->uint32[3] = r6*/
"umull r3, r4, %9, %14;\n\t"
"adcs r6, r3, r5; \n\t"
"mov %4, r6; \n\t"/*res->uint32[4] = r6*/
"umull r3, r5, %9, %15;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %5, r6; \n\t"/*res->uint32[5] = r6*/
"umull r3, r4, %9, %16;\n\t"
"adcs r6, r3, r5; \n\t"
"mov %6, r6; \n\t"/*res->uint32[6] = r6*/
"umull r3, r5, %9, %17;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %7, r6; \n\t"/*res->uint32[7] = r6*/
"adc r6, r5, #0 ; \n\t"
"mov %8, r6; \n\t"/*res->uint32[8] = r6*/
: "=r"(res->uint32[8]), "=r"(res->uint32[7]), "=r"(res->uint32[6]), "=r"(res->uint32[5]), "=r"(res->uint32[4]),
"=r"(res->uint32[3]), "=r"(res->uint32[2]), "=r"(res->uint32[1]), "=r"(res->uint32[0])
: "r"(A), "r"(B->uint32[7]), "r"(B->uint32[6]), "r"(B->uint32[5]),
"r"(B->uint32[4]), "r"(B->uint32[3]), "r"(B->uint32[2]), "r"(B->uint32[1]), "r"(B->uint32[0]), "r"(temp)
: "r3", "r4", "r5", "r6", "cc", "memory");
}
EDIT-1: I updated my clobber list based on the first comment, but I still get the same error
A simple solution is to break this up and don't use 'clobber'. Declare the variables as 'tmp1', etc. Try not to use any mov statements; let the compiler do this if it has to. The compiler will use an algorithm to figure out the best 'flow' of information. If you use 'clobber', it can not reuse registers. They way it is now, you make it load all the memory first before the assembler executes. This is bad as you want memory/CPU ALU to pipeline.
void multiply32x256(uint32_t A, UN_256fe* B, UN_288bite* res)
{
uint32_t mulhi1, mullo1;
uint32_t mulhi2, mullo2;
uint32_t tmp;
asm("umull %0, %1, %2, %3;\n\t"
: "=r" (mullo1), "=r" (mulhi1)
: "r"(A), "r"(B->uint32[7])
);
res->uint32[8] = mullo1; /* was 'mov %0, r3; */
volatile asm("umull %0, %1, %3, %4;\n\t"
"adds %2, %5, %6; \n\t"/*res->uint32[1] = r3 + r4*/
: "=r" (mullo2), "=r" (mulhi2), "=r" (tmp)
: "r"(A), "r"(B->uint32[6]), "r" (mullo1), "r"(mulhi1)
: "cc"
);
res->uint32[7] = tmp; /* was 'mov %1, r6; */
/* ... etc */
}
The whole purpose of the 'gcc inline assembler' is not to code assembler directly in a 'C' file. It is to use the register allocation logic of the compiler AND do something that can not be easily done in 'C'. The use of carry logic in your case.
By not making it one huge 'asm' clause, the compiler can schedule the loads from memory as it needs new registers. It will also pipeline your 'UMULL' ALU activity with the load/store unit.
You should only use clobber if an instruction implicitly clobbers a specific register. You may also use something like,
register int *p1 asm ("r0");
and use that as an output. However, I don't know of any ARM instructions like this besides those that might alter the stack and your code doesn't use these and the carry of course.
GCC knows that memory changes if it is listed as an input/output, so you don't need a memory clobber. In fact it is detrimental as the memory clobber is a compiler memory barrier and this will cause memory to be written when the compiler might be able to schedule that for latter.
The moral is use gcc inline assembler to work with the compiler. If you code in assembler and you have huge routines, the register use can become complex and confusing. Typical assembler coders will keep only one thing in a register per routine, but that is not always the best use of registers. The compiler will shuffle the data around in a fairly smart way that is difficult to beat (and not very satisfying to hand code IMO) when the code size gets larger.
You might want to look at the GMP library which has lots of ways to efficiently tackle some of the same issues it looks like your code has.
This code is meant to open opensl_es audio record capture a stream in mono, copy the stream and process left channel and right channel separately, then mix both channels into an output stream which is later played using opensl_es as well. the reason of the assembly code is because i found a bottle neck in the mixing function i had previously written in c which was a simple for loop to join left and right buffer into output buffer
well the problem is quite weird, when i put the logs i get in the output stream just what i want, the mixing of left and right buffer working and i see it in the logs, when i try to play the stream the application crashes, the same happens whenever i comment the logs, for some reason the app just crashes, so i'm starting to think it has something to do with the registers i am using or something in assembly code, i am new to assembly so is there something i am missing about arm assembly?
any idea why this is happening or how should i fix this problem?
here is the code: the first function is the main function which i use to capture sound call process functions. Te second "mux" is the function with the inline assembly in it.
void start_playing()
{
OPENSL_STREAM *pStream;
int samps, i, j;
short inbuffer[VECSAMPS_MONO], outbuffer[VECSAMPS_STEREO];
pStream = android_OpenAudioDevice(SR,1,2,BUFFERFRAMES);
if(pStream == NULL)
{
return;
}
on = 1;
iLog = 0;
while (on)
{
samps = android_AudioInRaw(pStream,inbuffer,VECSAMPS_MONO); //audio recording
//signal processing process called here for left channel then for right channel (equalizing, etc)
mux(inbuffer, inbuffer, outbuffer,VECSAMPS_MONO); //Assembly mixing of left and right channel into output channel
//android_AudioOutRaw(pStream,outbuffer,samps*2);//audio playing
}
android_CloseAudioDevice(pStream);
}
//assembly function here
void mux(short *pLeftBuf, short *pRightBuf, short *pOutBuf, int vecsamps_mono)
{
int *pIter;
*pIter = vecsamps_mono / 4;
__android_log_print(ANDROID_LOG_INFO, "$$$$$$$$$$$$", "value : %d , %d , %d , %d",pLeftBuf[0],pLeftBuf[1], pRightBuf[0],pRightBuf[1]);
asm volatile(
"ldr r9, %[outbuf];"
"ldr r0, %[leftbuf];"
"ldr r1, %[rightbuf];"
"ldr r2, %[iter];"
"ldr r8, [r2];"
"loop: "
"ldr r2, [r0];"
"ldr r3, [r1];"
"ldr r7, =0xffff;"
"and r4, r2, r7;"
"and r5, r3, r7;"
"lsl r5, r5, #16;"
"orr r4, r4, r5;"
"lsl r7, r7, #16;"
"and r5, r2, r7;"
"and r6, r3, r7;"
"lsr r6, r6, #16;"
"orr r5, r5, r6;"
"str r4, [r9];"
"str r5, [r9, #4];"
"add r0, r0, #4;"
"add r1, r1, #4;"
"add r9, r9, #8;"
"subs r8, r8, #1;"
"bne loop"
:[outbuf] "=m" (pOutBuf)
:[leftbuf] "m" (pLeftBuf) ,[rightbuf] "m" (pRightBuf),[iter] "m" (pIter)
:"r0","r1","r2","r3","r4","r5","r8","r9","memory","cc"
);
__android_log_print(ANDROID_LOG_INFO, "##################", "value : %d , %d , %d , %d" ,*pOutBuf,*(pOutBuf+1),*(pOutBuf+2) ,*(pOutBuf+3));
}
any suggestions?
this is the error i get in logcat:
01-14 11:41:40.992: A/libc(16161): Fatal signal 11 (SIGSEGV) at 0x00000000 (code=1), thread 16178 (Thread-4783)
*pIter = vecsamps_mono / 4;
...
"ldr r2, %[iter];"
"ldr r8, [r2];"
...
...[iter] "m" (pIter)
Maybe, just maybe, vecsamps_mono isn't 4 times a valid memory address.
Not that you even get that far, dereferencing an uninitialised pointer in that first line.