This one is about dereferencing structure variables in a chain. Please consider this code:
#include <stdio.h>

struct ChannelInfo
{
    int iData1;
    int iData2;
    int iData3;
    int iData4;
};

struct AppInfo
{
    struct ChannelInfo gChannelInfo[100];
} gAppInfo;

void foo1(void);
void foo2(void);

int main(void)
{
    gAppInfo.gChannelInfo[50].iData1 = 1;
    gAppInfo.gChannelInfo[50].iData2 = 2;
    gAppInfo.gChannelInfo[50].iData3 = 3;
    gAppInfo.gChannelInfo[50].iData4 = 4;
    foo1();
    foo2();
    return 0;
}

void foo1(void)
{
    printf("Data1 = %d, Data2 = %d, Data3 = %d, Data4 = %d", gAppInfo.gChannelInfo[50].iData1, gAppInfo.gChannelInfo[50].iData2, gAppInfo.gChannelInfo[50].iData3, gAppInfo.gChannelInfo[50].iData4);
}

void foo2(void)
{
    struct ChannelInfo* pCurrentChan = &gAppInfo.gChannelInfo[50];
    printf("Data1 = %d, Data2 = %d, Data3 = %d, Data4 = %d", pCurrentChan->iData1, pCurrentChan->iData2, pCurrentChan->iData3, pCurrentChan->iData4);
}
Is foo2() any faster than foo1()? What happens if the array index is not a constant but is supplied by the user at run time? I would be grateful if someone could profile this code.
This assembly version of your code could help you understand why one version might be slower than the other. Of course, it can vary depending on the target architecture and your optimization flags (compiling with -O2 or -O3 produces the same code for foo1 and foo2).
In foo2, the address of the ChannelInfo element is stored in a register, and the field addresses are calculated relative to the value held in that register. In the worst case the pointer is kept on the stack (as a local variable), in which case foo2 can be about as slow as foo1.
In foo1, the addresses of the values passed to printf are calculated relative to the global variable gAppInfo, which lives in static storage (and may or may not already be in the cache).
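To make the non-constant-index case from the question concrete, here is a sketch (not part of the original code; foo3 and idx are illustrative names) in which the element address for a runtime index is computed once into a local pointer. With optimization enabled the compiler typically does this on its own:
/* Sketch: the index is only known at run time (e.g. entered by the user). */
void foo3(int idx)
{
    /* one address computation, reused for all four field accesses */
    struct ChannelInfo *pChan = &gAppInfo.gChannelInfo[idx];
    printf("Data1 = %d, Data2 = %d, Data3 = %d, Data4 = %d",
           pChan->iData1, pChan->iData2, pChan->iData3, pChan->iData4);
}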
As per @Lundin's request, I added these numbers for reference:
Execution of an instruction: 1 ns
Fetch from main memory: ~100 ns
Assembly version with the -O2 flag (-Os and -O3 produce the same code)
Pondering things like this isn't meaningful; it is premature optimization, because the code will get optimized so that both functions are equivalent.
If you for some reason don't optimize the code, foo2() will be slightly slower, because it yields a few more instructions.
Please note that the call to printf is roughly 100 times slower than the rest of the code in that function, so if you are truly concerned about performance, you should focus on avoiding stdio.h rather than on these kinds of micro-optimizations.
At the bottom of the answer I have included some benchmarking code for Windows. Because the printf call is so slow compared to the rest of the code, and we aren't really interested in benchmarking printf itself, I removed the printf calls and replaced them with reads into volatile variables, meaning that the compiler is required to perform the reads regardless of optimization level.
gcc test.c -otest.exe -std=c11 -pedantic-errors -Wall -Wextra -O0
Output:
foo1 5.669101us
foo2 7.178366us
gcc test.c -otest.exe -std=c11 -pedantic-errors -Wall -Wextra -O2
Output:
foo1 2.509606us
foo2 2.506889us
As we can see, the difference in execution time of the non-optimized code corresponds roughly to the number of assembler instructions produced (see the answer by @dvhh).
Unscientifically:
10 / (10 + 16) instructions = 0.384
5.67 / (5.67 + 7.18) microseconds = 0.441
Benchmarking code:
#include <stdlib.h>
#include <stdio.h>
#include <windows.h>
struct ChannelInfo
{
int iData1;
int iData2;
int iData3;
int iData4;
};
struct AppInfo
{
struct ChannelInfo gChannelInfo[100];
} gAppInfo;
void foo1 (void);
void foo2 (void);
static double get_time_diff_us (const LARGE_INTEGER* freq,
const LARGE_INTEGER* before,
const LARGE_INTEGER* after)
{
return ((after->QuadPart - before->QuadPart)*1000000.0) / (double)freq->QuadPart;
}
int main (void)
{
/*** Initialize benchmarking functions ***/
LARGE_INTEGER freq;
if(QueryPerformanceFrequency(&freq)==FALSE)
{
printf("QueryPerformanceFrequency not supported");
return 0;
}
LARGE_INTEGER time_before;
LARGE_INTEGER time_after;
gAppInfo.gChannelInfo[50].iData1 = 1;
gAppInfo.gChannelInfo[50].iData2 = 2;
gAppInfo.gChannelInfo[50].iData3 = 3;
gAppInfo.gChannelInfo[50].iData4 = 4;
const size_t ITERATIONS = 1000000;
QueryPerformanceCounter(&time_before);
for(size_t i=0; i<ITERATIONS; i++)
{
foo1();
}
QueryPerformanceCounter(&time_after);
printf("foo1 %fus\n", get_time_diff_us(&freq, &time_before, &time_after));
QueryPerformanceCounter(&time_before);
for(size_t i=0; i<ITERATIONS; i++)
{
foo2();
}
QueryPerformanceCounter(&time_after);
printf("foo2 %fus\n", get_time_diff_us(&freq, &time_before, &time_after));
}
void foo1 (void)
{
volatile int d1, d2, d3, d4;
d1 = gAppInfo.gChannelInfo[50].iData1;
d2 = gAppInfo.gChannelInfo[50].iData2;
d3 = gAppInfo.gChannelInfo[50].iData3;
d4 = gAppInfo.gChannelInfo[50].iData4;
}
void foo2 (void)
{
struct ChannelInfo* pCurrentChan = &gAppInfo.gChannelInfo[50];
volatile int d1, d2, d3, d4;
d1 = pCurrentChan->iData1;
d2 = pCurrentChan->iData2;
d3 = pCurrentChan->iData3;
d4 = pCurrentChan->iData4;
}
Yes, foo2() is definitely faster than foo1(), because foo2() keeps a pointer to that memory block, and every time you access it, it just dereferences the pointer and fetches the value from memory.
Related
I want to verify the role of volatile with this method, but my inline assembly code doesn't seem to be able to modify the value of i without the compiler knowing. According to the articles I read, I only need to write assembly code like __asm { mov dword ptr [ebp-4], 20h }, and I think I wrote the same thing the article did.
actual output:
before = 10
after = 123
expected output:
before = 10
after = 10
Article link: https://www.runoob.com/w3cnote/c-volatile-keyword.html
#include <stdio.h>
int main() {
int a, b;
// volatile int i = 10;
int i = 10;
a = i;
printf("before = %d\n", a);
// Change the value of i in memory without letting the compiler know.
// I can't run the following statement here, so I wrote one myself
// mov dword ptr [ebp-4], 20h
asm("movl $123, -12(%rbp)");
b = i;
printf("after = %d\n", b);
}
I want to verify the role of volatile ...
You can't.
If a variable is not volatile, the compiler may optimize accesses to it, but it is not required to do so.
A compiler may always treat any variable as volatile.
How to change the value of a variable without the compiler knowing?
Create a second thread writing to the variable.
Example
The following example is for Linux (under Windows, you need a different function than pthread_create()):
#include <stdio.h>
#include <pthread.h>
int testVar;
volatile int waitVar;
void * otherThread(void * dummy)
{
while(waitVar != 2) { /* Wait */ }
testVar = 123;
waitVar = 3;
return NULL;
}
int main()
{
pthread_t pt;
waitVar = 1;
pthread_create(&pt, 0, otherThread, NULL);
testVar = 10;
waitVar = 2;
while(waitVar != 3) { /* Wait */ }
printf("%d\n", testVar - 10);
return 0;
}
If you compile with gcc -O0 -o x x.c -lpthread, the compiler does not optimize and behaves as if all variables were volatile. printf() prints 113.
If you compile with -O3 instead of -O0, printf() prints 0.
If you replace int testVar by volatile int testVar, printf() always prints 113 (independent of -O0/-O3).
(Tested with the GCC 9.4.0 compiler.)
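If you prefer not to depend on pthreads, the same experiment ports directly to C11 <threads.h> (a sketch, assuming your compiler and C library provide C11 threads; the logic mirrors the pthread example above):
#include <stdio.h>
#include <threads.h>

int testVar;
volatile int waitVar;

/* C11 thread functions return int instead of void*. */
int otherThread(void *dummy)
{
    (void)dummy;
    while (waitVar != 2) { /* Wait */ }
    testVar = 123;
    waitVar = 3;
    return 0;
}

int main(void)
{
    thrd_t t;
    waitVar = 1;
    thrd_create(&t, otherThread, NULL);
    testVar = 10;
    waitVar = 2;
    while (waitVar != 3) { /* Wait */ }
    printf("%d\n", testVar - 10);
    thrd_join(t, NULL);
    return 0;
}
The volatile versus non-volatile behaviour should be the same as with the pthread version.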
Consider the following header file, "tls.h":
#include <stdint.h>
// calling this function is expensive
uint64_t foo(uint64_t x);
extern __thread uint64_t cache;
static inline uint64_t
get(uint64_t x)
{
// if cache is not valid
if (cache == UINT64_MAX)
cache = foo(x);
return cache + x;
}
and source file "tls.c":
#include "tls.h"
__thread uint64_t cache = {0};
uint64_t foo(uint64_t x)
{
// imagine some calculations are performed here
return 0;
}
Below is an example usage of the get() function in "main.c":
#include "tls.h"
uint64_t t = 0;
int main()
{
uint64_t x = 0;
for(uint64_t i = 0; i < 1024UL * 1024 * 1024; i++){
t += get(i);
x++;
}
}
The presented files are compiled as follows:
gcc -c -O3 tls.c
gcc -c -O3 main.c
gcc -O3 main.o tls.o
Examining the performance of the loop in "main.c" revealed that the compiler optimizes it very poorly. After disassembling the binary, it is clear that the TLS variable is accessed in every iteration.
Execution time on my machine is 1.7s.
However, if I remove the check for cache validity in get() so that it looks like this:
static inline uint64_t
get(uint64_t x)
{
return cache + x;
}
the compiler is now able to generate much faster code: it removes the loop completely and emits only a single "add" instruction. Execution time is ~0.02s.
Why is the compiler not able to optimize the first case? The TLS variable cannot be changed by other threads, so the compiler should be able to optimize this, right?
Is there any other way I can optimize the get() function?
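For illustration, here is a manually hoisted variant of the loop (a sketch using the same names as in "main.c"; it assumes the cached value stays valid for the whole loop). Reading the thread-local value into a plain local once gives the optimizer the same freedom as the simplified get():
#include "tls.h"

uint64_t t = 0;

int main()
{
    /* One call fills the cache if needed; get(0) returns the cached value. */
    uint64_t base = get(0);
    for(uint64_t i = 0; i < 1024UL * 1024 * 1024; i++){
        t += base + i;   /* no TLS access inside the loop */
    }
}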
I am wondering if the following example is a Clang SA false positive, and if so, is there a way to suppress it?
The key here is that I am copying a structure containing bit-fields by casting it to a word instead of doing a field-by-field copy (or memcpy). Neither the field-by-field copy nor memcpy triggers a warning, but copying as a word (after the cast) raises an "uninitialized access" warning. This is on an embedded system where only word access is possible, and these kinds of word copies are commonplace.
Below is the example code:
#include <stdio.h>
#include <string.h>
struct my_fields_t {
unsigned int f0: 16;
unsigned int f1: 8;
unsigned int f2: 8;
};
int main(void) {
struct my_fields_t var1, var2;
// initialize all the fields in var1.
var1.f0 = 1;
var1.f1 = 2;
var1.f2 = 3;
// Method #1: copy var1 -> var2 as a word (sizeof(unsigned int) = 4).
unsigned int *src = (unsigned int *) &var1;
unsigned int *dest = (unsigned int *) &var2;
*dest = *src;
// Method #2: copy var1->var2 field-by-field [NO SA WARNINGS]
// var2.f0 = var1.f0;
// var2.f1 = var1.f1;
// var2.f2 = var1.f2;
// Method #3: use memcpy to copy var1 to var2 [NO SA WARNINGS]
// memcpy(&var2, &var1, sizeof(struct my_fields_t));
printf("%d, %d, %d\n", var1.f0, var1.f1, var1.f2);
printf("%d, %d, %d\n", var2.f0, var2.f1, var2.f2); // <--- Function call argument is an uninitialized value
printf("sizeof(unsigned int) = %ld\n", sizeof(unsigned int));
}
Here's the output:
$ clang --version
clang version 4.0.0 (tags/RELEASE_401/final)
Target: x86_64-unknown-linux-gnu
Thread model: posix
$ clang -Wall clang_sa.c
$ ./a.out
1, 2, 3
1, 2, 3
sizeof(unsigned int) = 4
$ scan-build clang clang_sa.c
scan-build: Using '<snipped>/clang-4.0' for static analysis
clang_sa.c:33:3: warning: Function call argument is an uninitialized value
printf("%d, %d, %d\n", var2.f0, var2.f1, var2.f2); // <--- Function call argument is an uninitialized value
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 warning generated.
scan-build: 1 bug found.
In the above example, it is quite clear that all the fields in var2 will be initialized by the word copy, so clang SA shouldn't complain about uninitialized access.
I appreciate any help/insight.
In terms of suppressing a specific warning, from the documentation:
Q: How can I suppress a specific analyzer warning?
There is currently no solid mechanism for suppressing an analyzer warning, although this is currently being investigated. ...
But the next question shows that you can exclude a block of code from static analysis by surrounding it with an #ifdef block:
Q: How can I selectively exclude code the analyzer examines?
When the static analyzer is using clang to parse source files, it implicitly defines the preprocessor macro __clang_analyzer__. One can use this macro to selectively exclude code the analyzer examines. ...
So, you could do it like this:
#ifdef __clang_analyzer__
#define COPY_STRUCT(DEST, SRC) (DEST) = (SRC)
#else
#define COPY_STRUCT(DEST, SRC) do { \
const unsigned int *src = (const void *)&(SRC); \
unsigned int *dest = (void *)&(DEST); \
*dest = *src; \
} while(0)
#endif
COPY_STRUCT(var2, var1);
I tried compiling this code,
#include <stdlib.h>
struct rgb {
int r, g, b;
};
void adjust_brightness(struct rgb *picdata, size_t len, int adjustment) {
// assume adjustment is between 0 and 255.
for (int i = 0; i < len; i++) {
picdata[i].r += adjustment;
picdata[i].g += adjustment;
picdata[i].b += adjustment;
}
}
on OSX using this command,
$ cc -Rpass-analysis=loop-vectorize -c -std=c99 -O3 brightness.c
brightness.c:13:3: remark: loop not vectorized: unsafe dependent memory operations in loop [-Rpass-analysis=loop-vectorize]
for (int i = 0; i < len; i++) {
^
Can someone explain what is unsafe and dependent here? I'm learning about SIMD, and this was presented as the most obvious use case for it. I was hoping to see how the compiler would generate SIMD instructions for a simple example. In my head, I expected that instead of incrementing by 1, the compiler would step by however many elements fit into the vector registers and process the loop body that many elements at a time.
Do I misunderstand?
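For comparison, here is a sketch of the same adjustment written over a flat int buffer (adjust_brightness_flat is just an illustrative name); a single contiguous, unit-stride loop like this is the form that vectorizers typically do handle at -O3:
#include <stddef.h>

/* Sketch: treat the pixel data as a flat array of ints
   (len counts ints, i.e. 3 * the number of pixels). */
void adjust_brightness_flat(int *picdata, size_t len, int adjustment)
{
    for (size_t i = 0; i < len; i++) {
        picdata[i] += adjustment;
    }
}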
I'm writing some code which stores some data structures in a special named binary section. These are all instances of the same struct; they are scattered across many C files and are not within scope of each other. By placing them all in the named section, I can iterate over all of them.
This works perfectly with GCC and GNU ld. It fails on Mac OS X due to missing __start___mysection and __stop___mysection symbols. I guess the LLVM ld is not smart enough to provide them automatically.
With GCC and GNU ld, I use __attribute__((section(...))) plus some specially named extern pointers which are magically filled in by the linker. Here's a trivial example:
#include <stdio.h>
extern int __start___mysection[];
extern int __stop___mysection[];
static int x __attribute__((section("__mysection"))) = 4;
static int y __attribute__((section("__mysection"))) = 10;
static int z __attribute__((section("__mysection"))) = 22;
#define SECTION_SIZE(sect) \
((size_t)((__stop_##sect - __start_##sect)))
int main(void)
{
size_t sz = SECTION_SIZE(__mysection);
int i;
printf("Section size is %u\n", sz);
for (i=0; i < sz; i++) {
printf("%d\n", __start___mysection[i]);
}
return 0;
}
What is the general way to get a pointer to the beginning/end of a section with the FreeBSD linker? Anyone have any ideas?
For reference linker is:
#(#)PROGRAM:ld PROJECT:ld64-127.2
llvm version 3.0svn, from Apple Clang 3.0 (build 211.12)
Similar question was asked about MSVC here: How to get a pointer to a binary section in MSVC?
You can get the Darwin linker to do this for you.
#include <stdio.h>
extern int start_mysection __asm("section$start$__DATA$__mysection");
extern int stop_mysection __asm("section$end$__DATA$__mysection");
// If you don't reference x, y and z explicitly, they'll be dead-stripped.
// Prevent that with the "used" attribute.
static int x __attribute__((used,section("__DATA,__mysection"))) = 4;
static int y __attribute__((used,section("__DATA,__mysection"))) = 10;
static int z __attribute__((used,section("__DATA,__mysection"))) = 22;
int main(void)
{
long sz = &stop_mysection - &start_mysection;
long i;
printf("Section size is %ld\n", sz);
for (i=0; i < sz; ++i) {
printf("%d\n", (&start_mysection)[i]);
}
return 0;
}
Using Mach-O information:
#include <mach-o/getsect.h>
char *secstart;
unsigned long secsize;
secstart = getsectdata("__SEGMENT", "__section", &secsize);
The above gives information about a section declared as:
int x __attribute__((section("__SEGMENT,__section"))) = 123;
More information: https://developer.apple.com/library/mac/documentation/developertools/conceptual/machoruntime/Reference/reference.html
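For completeness, a minimal usage sketch built around that call (the segment and section names are the same placeholders as in the declaration above):
#include <mach-o/getsect.h>
#include <stdio.h>

int x __attribute__((section("__SEGMENT,__section"))) = 123;

int main(void)
{
    unsigned long secsize;
    /* Returns NULL if the section does not exist. On newer macOS versions the
       returned address may need to be adjusted by the image slide (ASLR). */
    char *secstart = getsectdata("__SEGMENT", "__section", &secsize);
    if (secstart != NULL) {
        const int *vals = (const int *)secstart;
        for (unsigned long i = 0; i < secsize / sizeof(int); i++) {
            printf("%d\n", vals[i]);
        }
    }
    return 0;
}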