I am trying to measure the size of L1 cache by reverse engineering. But I am getting somewhere wrong. When I compiled and run this code in my Linux terminal I was getting a sudden jump of cycles at index 32 but even that was a jump of just 100 cycles. But my cache size is 64. I have no idea why this is going wrong? Do we need to clear the cache before running this code ? And also how to find the associativity of cache?
#include<stdio.h>
unsigned long long int measurecycle(char *baseaddress,int index){
unsigned long flags;
unsigned long long int start,end;
unsigned int cycles_low,cycles_high,cycles_low1,cycles_high1;
char variable;
asm volatile (
"CPUID\n\t"
"RDTSC\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
"%rax", "%rbx", "%rcx", "%rdx");
variable = *(baseaddress+index);
asm volatile(
"RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1)::
"%rax", "%rbx", "%rcx", "%rdx");
start =(((unsigned long long)cycles_high << 32) | cycles_low );
end =(((unsigned long long)cycles_high1 << 32) | cycles_low1);
return end-start;
}
int main(){
char array[128];
char *baseaddress = &array[0];
printf("Base Address \t| Cycles taken \n");
int index;
for (index = 0;index < 512;index++){
printf("%p \t\t %d \t\t
%lld\n",baseaddress,index,measurecycle(baseaddress,index));
}
}
calling fucntions from main
global declarations int A=0, B=0, C=0;
void load()
{
system("clear");
printf("Enter 2 values: ");
scanf("%d %d",&B,&C);
__asm__ __volatile__("movl %1,%%ebx;"
"movl %2,%%ecx;"
:: "b" (B) , "c" (C)
);
printf("Loaded Successfully. (press any key to continue.)");
scanf("%d",&temp);
}
void Add()
{
system("clear");
__asm__("movl %ebx,%eax;"
"addl %ecx,%eax;"
:"=a" (Acc)
);
printf("Answer: %d (press any key to continue)",Acc);
scanf("%d",&temp);
}
void Sub()
{
system("clear");
__asm__("movl %ebx,%eax;"
"subl %ecx,%eax;"
:"=a" (Acc)
);
printf("Answer: %d (press any key to continue)",Acc);
scanf("%d",&temp);
}
void Mul()
{
system("clear");
__asm__("movl %ebx,%eax;"
"imull %ecx,%eax;"
:"=a" (Acc)
);
printf("Answer: %d (press any key to continue)",Acc);
scanf("%d",&temp);
}
void Div()
{
int rem;
system("clear");
__asm__("movl %0x0,%edx;"
"movl %ebx,%eax;"
"idivl %ecx"
:"=a" (Acc) , "=r" (rem)
);
__asm__("movl %1, %eax"
:: "a" (Acc)
);
printf("Quotient: %d Remainder= %d (press any key to continue)",Acc,rem);
scanf("%d",&temp);
}
here's a code using inline assembly in c and while compiling with gcc -g it gives error :
test3.c: In function ‘main’:
test3.c:22:13: error: output operand constraint lacks ‘=’
__asm__("movl %1,%ebx;" "movl %2,%ecx;" : "r" (B): "r" (C));
^~~~~~~
test3.c:22:13: error: output operand constraint lacks ‘=’
test3.c:22:13: error: invalid lvalue in asm output 0
references from :
https://www.codeproject.com/articles/15971/using-inline-assembly-in-c-c
Source Codes from the site is compiling perfectly but while compiling the above code it gives errors
Also tried asm volatile
but it again gcc is throwing :
error: bad register name `%%eax'
My new company project, they want the code run for the 32-bit, the compile server is a CentOS 5.0 with GCC 4.1.1, that was the nightmare.
There are lots of functions using in the project like __sync_fetch_and_add was given in GCC 4.1.2 and later.
I was told can not upgrade GCC version, so I have to make another solution after Googling for several hours.
When I wrote a demo to test, I just got the wrong answer, the code blow want to replace function __sync_fetch_and_add
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static int count = 0;
int compare_and_swap(int* reg, int oldval, int newval)
{
register char result;
#ifdef __i386__
__asm__ volatile ("lock; cmpxchgl %3, %0; setz %1"
: "=m"(*reg), "=q" (result)
: "m" (*reg), "r" (newval), "a" (oldval)
: "memory");
return result;
#elif defined(__x86_64__)
__asm__ volatile ("lock; cmpxchgq %3, %0; setz %1"
: "=m"(*reg), "=q" (result)
: "m" (*reg), "r" (newval), "a" (oldval)
: "memory");
return result;
#else
#error:architecture not supported and gcc too old
#endif
}
void *test_func(void *arg)
{
int i = 0;
for(i = 0; i < 2000; ++i) {
compare_and_swap((int *)&count, count, count + 1);
}
return NULL;
}
int main(int argc, const char *argv[])
{
pthread_t id[10];
int i = 0;
for(i = 0; i < 10; ++i){
pthread_create(&id[i], NULL, test_func, NULL);
}
for(i = 0; i < 10; ++i) {
pthread_join(id[i], NULL);
}
//10*2000=20000
printf("%d\n", count);
return 0;
}
Whent I got the wrong result:
[root#centos-linux-7 workspace]# ./asm
17123
[root#centos-linux-7 workspace]# ./asm
14670
[root#centos-linux-7 workspace]# ./asm
14604
[root#centos-linux-7 workspace]# ./asm
13837
[root#centos-linux-7 workspace]# ./asm
14043
[root#centos-linux-7 workspace]# ./asm
16160
[root#centos-linux-7 workspace]# ./asm
15271
[root#centos-linux-7 workspace]# ./asm
15280
[root#centos-linux-7 workspace]# ./asm
15465
[root#centos-linux-7 workspace]# ./asm
16673
I realize in this line
compare_and_swap((int *)&count, count, count + 1);
count + 1 was wrong!
Then how can I implement the same function as __sync_fetch_and_add. The compare_and_swap function works when the third parameter is constant.
By the way, compare_and_swap function is that right? I just Googled for that, not familiar with assembly.
I got despair with this question.
………………………………………………………………………………………………………………………………………………………………………………………………………………………
after seeing the answer below,I use while and got the right answer,but seems confuse more.
here is the code:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static unsigned long count = 0;
int sync_add_and_fetch(int* reg, int oldval, int incre)
{
register char result;
#ifdef __i386__
__asm__ volatile ("lock; cmpxchgl %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (oldval + incre), "a" (oldval) : "memory");
return result;
#elif defined(__x86_64__)
__asm__ volatile ("lock; cmpxchgq %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (newval + incre), "a" (oldval) : "memory");
return result;
#else
#error:architecture not supported and gcc too old
#endif
}
void *test_func(void *arg)
{
int i=0;
int result = 0;
for(i=0;i<2000;++i)
{
result = 0;
while(0 == result)
{
result = sync_add_and_fetch((int *)&count, count, 1);
}
}
return NULL;
}
int main(int argc, const char *argv[])
{
pthread_t id[10];
int i = 0;
for(i=0;i<10;++i){
pthread_create(&id[i],NULL,test_func,NULL);
}
for(i=0;i<10;++i){
pthread_join(id[i],NULL);
}
//10*2000=20000
printf("%u\n",count);
return 0;
}
the answer goes right to 20000,so i think when you use sync_add_and_fetch function,you should goes with a while loop is stupid,so I write like this:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static unsigned long count = 0;
int compare_and_swap(int* reg, int oldval, int incre)
{
register char result;
#ifdef __i386__
__asm__ volatile ("lock; cmpxchgl %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (oldval + incre), "a" (oldval) : "memory");
return result;
#elif defined(__x86_64__)
__asm__ volatile ("lock; cmpxchgq %3, %0; setz %1" : "=m"(*reg), "=q" (result) : "m" (*reg), "r" (newval + incre), "a" (oldval) : "memory");
return result;
#else
#error:architecture not supported and gcc too old
#endif
}
void sync_add_and_fetch(int *reg,int oldval,int incre)
{
int ret = 0;
while(0 == ret)
{
ret = compare_and_swap(reg,oldval,incre);
}
}
void *test_func(void *arg)
{
int i=0;
for(i=0;i<2000;++i)
{
sync_add_and_fetch((int *)&count, count, 1);
}
return NULL;
}
int main(int argc, const char *argv[])
{
pthread_t id[10];
int i = 0;
for(i=0;i<10;++i){
pthread_create(&id[i],NULL,test_func,NULL);
}
for(i=0;i<10;++i){
pthread_join(id[i],NULL);
}
//10*2000=20000
printf("%u\n",count);
return 0;
}
but when i run this code with ./asm after g++ -g -o asm asm.cpp -lpthread.the asm just stuck for more than 5min,see top in another terminal:
3861 root 19 0 102m 888 732 S 400 0.0 2:51.06 asm
I just confused,is this code not the same?
The 64-bit compare_and_swap is wrong as it swaps 64 bits but int is only 32 bits.
compare_and_swap should be used in a loop which retries it until is succeeds.
Your result look right to me. lock cmpxchg succeeds most of the time, but will fail if another core beat you to the punch. You're doing 20k attempts to cmpxchg count+1, not 20k atomic increments.
To write __sync_fetch_and_add with inline asm, you'll want to use lock xadd. It's specifically designed to implement fetch-add.
Implementing other operations, like fetch-or or fetch-and, require a CAS retry loop if you actually need the old value. So you could make a version of the function that doesn't return the old value, and is just a sync-and without the fetch, using lock and with a memory destination. (Compiler builtins can make this optimization based on whether the result is needed or not, but an inline asm implementation doesn't get a chance to choose asm based on that information.)
For efficiency, remember that and, or, add and many other instructions can use immediate operands, so a "re"(src) constraint would be appropriate (not "ri" for int64_t on x86-64, because that would allow immediates too large. https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html). But cmpxchg, xadd, and xchg can't use immediates, of course.
I'd suggest looking at compiler output for modern gcc (e.g. on http://godbolt.org/) for functions using the builtin, to see what compilers do.
But beware that inline asm can compile correctly given one set of surrounding code, but not the way you expect given different code. e.g. if the surrounding code copied a value after using CAS on it (probably unlikely), the compiler might decide to give the asm template two different memory operands for "=m"(*reg) and "m"(*reg), but your asm template assumes they will always be the same address.
IDK if gcc4.1 supports it, but "+m"(*reg) would declare a read/write memory operand. Otherwise perhaps you can use a matching constraint to say that the input is in the same location as an earlier operand, like "0"(*reg). But that might only work for registers, not memory, I didn't check.
"a" (oldval) is a bug: cmpxchg writes EAX on failure.
It's not ok to tell the compiler you leave a reg unmodified, and then write an asm template that does modify it. You will get unpredictable behaviour from stepping on the compiler's toes.
See c inline assembly getting "operand size mismatch" when using cmpxchg for a safe inline-asm wrapper for lock cmpxchg. It's written for gcc6 flag-output, so you'll have to back-port that and maybe a few other syntax details to crusty old gcc4.1.
That answer also addresses returning the old value so it doesn't have to be separately loaded.
(Using ancient gcc4.1 sounds like a bad idea to me, especially for writing multi-threaded code. So much room for error from porting working code with __sync builtins to hand-rolled asm. The risks of using a newer compiler, like stable gcc5.5 if not gcc7.4, are different but probably smaller.)
If you're going to rewrite code using __sync builtins, the sane thing would be to rewrite it using C11 stdatomic.h, or GNU C's more modern __atomic builtins that are intended to replace __sync.
The Linux kernel does successfully use inline asm for hand-rolled atomics, though, so it's certainly possible.
If you truly are in such a predicament, I would start with the following header file:
#ifndef SYNC_H
#define SYNC_H
#if defined(__x86_64__) || defined(__i386__)
static inline int sync_val_compare_and_swap_int(int *ptr, int oldval, int newval)
{
__asm__ __volatile__( "lock cmpxchgl %[newval], %[ptr]"
: "+a" (oldval), [ptr] "+m" (*ptr)
: [newval] "r" (newval)
: "memory" );
return oldval;
}
static inline int sync_fetch_and_add_int(int *ptr, int val)
{
__asm__ __volatile__( "lock xaddl %[val], %[ptr]"
: [val] "+r" (val), [ptr] "+m" (*ptr)
:
: "memory" );
return val;
}
static inline int sync_add_and_fetch_int(int *ptr, int val)
{
const int old = val;
__asm__ __volatile__( "lock xaddl %[val], %[ptr]"
: [val] "+r" (val), [ptr] "+m" (*ptr)
:
: "memory" );
return old + val;
}
static inline int sync_fetch_and_sub_int(int *ptr, int val) { return sync_fetch_and_add_int(ptr, -val); }
static inline int sync_sub_and_fetch_int(int *ptr, int val) { return sync_add_and_fetch_int(ptr, -val); }
/* Memory barrier */
static inline void sync_synchronize(void) { __asm__ __volatile__( "mfence" ::: "memory"); }
#else
#error Unsupported architecture.
#endif
#endif /* SYNC_H */
The same extended inline assembly works for both x86 and x86-64. Only the int type is implemented, and you do need to replace possible __sync_synchronize() calls with sync_synchronize(), and each __sync_...() call with sync_..._int().
To test, you can use e.g.
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
#include "sync.h"
#define THREADS 16
#define PERTHREAD 8000
void *test_func1(void *sumptr)
{
int *const sum = sumptr;
int n = PERTHREAD;
while (n-->0)
sync_add_and_fetch_int(sum, n + 1);
return NULL;
}
void *test_func2(void *sumptr)
{
int *const sum = sumptr;
int n = PERTHREAD;
while (n-->0)
sync_fetch_and_add_int(sum, n + 1);
return NULL;
}
void *test_func3(void *sumptr)
{
int *const sum = sumptr;
int n = PERTHREAD;
int oldval, curval, newval;
while (n-->0) {
curval = *sum;
do {
oldval = curval;
newval = curval + n + 1;
} while ((curval = sync_val_compare_and_swap_int(sum, oldval, newval)) != oldval);
}
return NULL;
}
static void *(*worker[3])(void *) = { test_func1, test_func2, test_func3 };
int main(void)
{
pthread_t thread[THREADS];
pthread_attr_t attrs;
int sum = 0;
int t, result;
pthread_attr_init(&attrs);
pthread_attr_setstacksize(&attrs, 65536);
for (t = 0; t < THREADS; t++) {
result = pthread_create(thread + t, &attrs, worker[t % 3], &sum);
if (result) {
fprintf(stderr, "Failed to create thread %d of %d: %s.\n", t+1, THREADS, strerror(errno));
exit(EXIT_FAILURE);
}
}
pthread_attr_destroy(&attrs);
for (t = 0; t < THREADS; t++)
pthread_join(thread[t], NULL);
t = THREADS * PERTHREAD * (PERTHREAD + 1) / 2;
if (sum == t)
printf("sum = %d (as expected)\n", sum);
else
printf("sum = %d (expected %d)\n", sum, t);
return EXIT_SUCCESS;
}
Unfortunately, I don't have an ancient version of GCC to test, so this has only been tested with GCC 5.4.0 and GCC-4.9.3 for x86 and x86-64 (using -O2) on Linux.
If you find any bugs or issues in the above, please let me know in a comment so I can verify and fix as needed.
Closed. This question is not reproducible or was caused by typos. It is not currently accepting answers.
This question was caused by a typo or a problem that can no longer be reproduced. While similar questions may be on-topic here, this one was resolved in a way less likely to help future readers.
Closed 6 years ago.
Improve this question
I'm trying to make new system calls on a redhat 2.4.18, and iv'e created a wrapper function for one of the system calls.
i will now add the system call code and the wrapper code add explanation below .
the system call:
int sys_read_TODO(pid_t pid, int TODO_index, char *TODO_description, ssize_t description_size, int* status){
struct task_struct *curTask = current;
struct task_struct *requestedTask = find_task_by_pid(pid);
// check if the requested process is valid
// check if he even exsits, and than whether he is this process, or one of his descendants.
if(!isValidProcess(curTask,requestedTask)){
return -ESRCH;
}
// the given process is ourself or one of our decendants.
// now check if the arguments are valid
if(TODO_description == NULL || status == NULL || TODO_index < 1 || TODO_index > requestedTask->todoQueueLength){
return -EINVAL;
}
// search for the requested todo and get his description
struct list_head *pos;
struct todoNode *curNode;
int counter = 0;
char *reqDesc = NULL;
int reqStatus = 0;
int reqDescLen = 0;
list_for_each(pos,&(requestedTask->todoQueue)){
counter +=1;
if(counter == TODO_index){
curNode = (todoNode*)list_entry(pos,todoNode,listNode);
reqDesc = curNode->description;
reqStatus = curNode->status;
reqDescLen = curNode->descLen;
}
}
// we got the description, now lets see if his size is bigger than the size they requested
if(description_size < reqDescLen){
return -EINVAL;
}
// try to update the status
int bytes_left = copy_to_user(status,&(reqStatus),sizeof(int));
// should we update the status using *status = reqStatus ?
if(bytes_left > 0){
return -EFAULT;
}
// now we will try to copy the description to user space
bytes_left = copy_to_user(TODO_description,reqDesc,reqDescLen);
if(bytes_left > 0){
return -EFAULT;
}
// successful. return the number of description bytes copied, which is len.
return reqDescLen;
}
The wrapper:
int read_TODO(pid_t pid, int TODO_index, char *TODO_description, ssize_t description_size, int* status){
int res;
__asm__
(
"pushl %%eax;"
"pushl %%ebx;"
"pushl %%ecx;"
"pushl %%edx;"
"movl $244, %%eax;"
"movl %1, %%ebx;"
"movl %2, %%ecx;"
"movl %3, %%edx;"
"movl %4, %%esi;"
"movl %5, %%edi;"
"int $0x80;"
"movl %%eax,%0;"
"popl %%edi;"
"popl %%esi;"
"popl %%edx;"
"popl %%ecx;"
"popl %%ebx;"
"popl %%eax;"
: "=m" (res)
: "m" (pid) ,"m" (TODO_index) ,"m"(TODO_description) ,"m" (description_size) ,"m"(status)
);
printf("read_TODO: res = %d\n",res);
if (res < 0)
{
printf("read_TODO: res = %d\n",res);
errno = res;
printf("read_TODO: errno = %d\n",errno);
printf("read_TODO: res = %d\n",res);
res = -1;
}
return res;
}
so the story is as follows:
I added each process a list of todo assignments and this system calls is supposed to read a given todo assignment's description.
The system call might return an error code. in this case, i was instructed to put the error code in the errno, and have the function return -1.
so in the wrapper i checked if the result value from the system call was negative, and if it was, i update the errno and return -1.
the problem is that i noticed by the prints that when i try to put the value of res into the value of errno, both errno and res get assigned with strange values.
From a run example i got these prints:
read_TODO: res = -3
read_TODO: res = -3
read_TODO: errno = 134513874
read_TODO: res = 134513874
What is the problem? and how can i fix it?
Given the wrapper function for system call write :
ssize_t my_write(int fd, const void *buf, size_t count)
{
long __res;
__asm__ volatile
("int $0x80"
: "=a" (__res)
: "0" (4),"D" ((long)(fd)),"S" ((long)(buf)), "d" ((long)(count))
: "ebx","memory");
if (-125 <= __res && __res < 0)
{
errno = -__res;
__res = -1;
}
return __res;
}
I've tried it with the code (from int main()) :
int main() {
my_write(2,"an read error occured\n",26);
return 0;
}
However it doesn't work . Any idea why ?
Thanks
Your constraints are off, the file descriptor needs to go in EBX, the buffer in ECX (not EDI/ESI respectively like you have).
Try:
__asm__ volatile
("int $0x80"
: "=a" (__res)
: "0" (4),"b" ((long)(fd)),"c" ((long)(buf)), "d" ((long)(count))
: "memory");