How is the Average Cache Miss Ratio (ACMR) calculated? - c

I'm studying Tom Forsyth's Linear-Speed Vertex Cache Optimization and I don't understand how he calculates the ACMR. From what I have read, I already know that ACMR = number of cache misses / number of triangles, but what I don't understand is what kind of cache is being used (i.e. FIFO or LRU?).
I have written a test program that calculates and prints the ACMR of a given 3D model using a FIFO cache. Can you please tell me if this code is OK, or should I use an LRU cache instead?
#include <stddef.h>
#include <stdint.h>

/* u16 is assumed to be a 16-bit unsigned integer type */
typedef uint16_t u16;

/* the number of entries in a FIFO cache */
#define FIFO_CACHE_SIZE 32

struct fifo_cache {
    long entries[FIFO_CACHE_SIZE];
};

/**
 * init_cache - initializes a FIFO cache
 * @cache: A pointer to the FIFO cache structure to be initialized.
 *
 * Before a FIFO cache can be used, it must be initialized by calling this
 * function.
 */
static void init_cache(struct fifo_cache *cache)
{
    int i = 0;

    /* initialize cache entries to an invalid value */
    for (i = 0; i < FIFO_CACHE_SIZE; i++)
        cache->entries[i] = -1;
}

/**
 * check_entry - checks if the same entry is already added to the cache
 * @cache: A pointer to the FIFO cache structure to be searched.
 * @entry: An entry to be searched for.
 *
 * Return: If the same entry was found, the return value is nonzero. Otherwise,
 * the return value is zero.
 */
static int check_entry(const struct fifo_cache *cache, u16 entry)
{
    int i = 0;

    for (i = 0; i < FIFO_CACHE_SIZE; i++) {
        if (cache->entries[i] == (long)entry)
            return 1;
    }
    return 0;
}

/**
 * add_entry - adds a new entry to the FIFO cache
 * @cache: A pointer to the FIFO cache structure the entry will be added to.
 * @entry: An entry to add.
 */
static void add_entry(struct fifo_cache *cache, u16 entry)
{
    long aux = 0;
    long aux2 = 0;
    int i = 0;

    /* insert the new entry at the front and shift the rest back;
       the oldest entry falls off the end */
    aux = cache->entries[0];
    cache->entries[0] = (long)entry;
    for (i = 1; i < FIFO_CACHE_SIZE; i++) {
        aux2 = cache->entries[i];
        cache->entries[i] = aux;
        aux = aux2;
    }
}

/**
 * calculate_acmr - calculates the average cache miss ratio (aka. ACMR)
 * @indices: The list of vertex indices.
 * @count: The number of vertex indices in the @indices list.
 */
float calculate_acmr(const u16 *indices, size_t count)
{
    struct fifo_cache cache = {0};
    long total = 0; /* the total number of cache misses */
    size_t i = 0;

    /* initialize the cache */
    init_cache(&cache);
    for (i = 0; i < count; i++) {
        if (!check_entry(&cache, indices[i])) {
            /* the entry doesn't exist in the cache, so add it */
            add_entry(&cache, indices[i]);
            total++;
        }
    }
    return ((float)total / (count / 3));
}

I found the answer. Modern GPUs use FIFO caches for simplicity and speed, so it makes sense to calculate the ACMR using a FIFO cache. The code given above is correct, so I'll keep using it.

You are correct, that is the way the hardware does it. Additionally, you may want to read this: http://www.realtimerendering.com/blog/acmr-and-atvr/
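For reference, that post also measures ATVR (average transformed-vertex ratio): total cache misses divided by the number of unique vertices, with 1.0 being the ideal. Something like this sketch, built on the code above, could compute it (count_unique() is a hypothetical helper returning the number of distinct indices, not shown here):
/* Sketch: ATVR = total cache misses / number of unique vertices.
   count_unique() is a hypothetical helper, not shown. */
float calculate_atvr(const u16 *indices, size_t count)
{
    struct fifo_cache cache = {0};
    long misses = 0;
    size_t i;

    init_cache(&cache);
    for (i = 0; i < count; i++) {
        if (!check_entry(&cache, indices[i])) {
            add_entry(&cache, indices[i]);
            misses++;
        }
    }
    return (float)misses / count_unique(indices, count);
}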

Related

Is it possible to use POSIX Message queue to transfer data between threads?

Is it possible to transfer data between threads, like producer and consumer, using a POSIX message queue?
I need to transfer arrays of 5000 doubles each from the producer thread to the consumer thread for processing.
Is the POSIX message queue designed for such a purpose?
POSIX message queues are absolutely the wrong tool for that.
All you actually need is a buffer, a couple of counters or pointers, a mutex, and a couple of condition variables:
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t buffer_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t buffer_more = PTHREAD_COND_INITIALIZER;
static pthread_cond_t buffer_room = PTHREAD_COND_INITIALIZER;

/* Pointer and counters are volatile, since different threads may change them
   whenever they hold the buffer_lock. */
static double * volatile buffer_data = NULL;
static volatile size_t buffer_size = 0;
static volatile size_t buffer_next = 0; /* First/next buffered value */
static volatile size_t buffer_ends = 0; /* First unused slot in buffer */

/* Optional flag to indicate no more data to be produced or consumed */
static volatile int buffer_done = 0;

/* Helper function to repack the buffer; caller must hold buffer_lock. */
static inline void buffer_repack_locked(void)
{
    if (buffer_ends > buffer_next) {
        if (buffer_next > 0) {
            memmove(buffer_data, buffer_data + buffer_next,
                    (buffer_ends - buffer_next) * sizeof buffer_data[0]);
            buffer_ends -= buffer_next;
            buffer_next = 0;
        }
    } else {
        buffer_next = 0;
        buffer_ends = 0;
    }
}
To grow the buffer (at any point), you use
static int buffer_resize(size_t new_size)
{
    pthread_mutex_lock(&buffer_lock);

    /* First, repack the buffer to start of the area. */
    buffer_repack_locked();

    /* Do not lose any data, however. */
    if (new_size < buffer_ends)
        new_size = buffer_ends;

    /* Reallocate. */
    void *new_data = realloc(buffer_data, new_size * sizeof buffer_data[0]);
    if (!new_data) {
        /* Not enough memory to reallocate; old data still exists. */
        pthread_mutex_unlock(&buffer_lock);
        return -1;
    }

    /* Success. */
    buffer_data = new_data;
    buffer_size = new_size;

    /* Wake up any waiters waiting on room in the buffer, just to be sure. */
    pthread_cond_broadcast(&buffer_room);
    pthread_mutex_unlock(&buffer_lock);
    return 0;
}
Producer or producers add a block of data to the buffer using
static void buffer_add(const double *data, size_t count)
{
    pthread_mutex_lock(&buffer_lock);
    buffer_repack_locked();

    while (count > 0) {
        if (buffer_ends >= buffer_size) {
            /* Buffer is full. Wait for more room, repack, retry. */
            pthread_cond_wait(&buffer_room, &buffer_lock);
            buffer_repack_locked();
            continue;
        }

        /* How much can we add? */
        size_t size = buffer_size - buffer_ends;
        if (size > count)
            size = count;

        memmove(buffer_data + buffer_ends, data, size * sizeof buffer_data[0]);
        buffer_ends += size;

        /* Wake up a consumer waiting on more data */
        pthread_cond_signal(&buffer_more);

        /* Update to reflect the data already added */
        data += size;
        count -= size;
    }

    /* All data added. */
    pthread_mutex_unlock(&buffer_lock);
}
Similarly, consumers get data from the buffer using
static size_t buffer_get(double *data, size_t min_size, size_t max_size)
{
    size_t size, have = 0;

    /* Make sure min and max size are in the right order. */
    if (max_size < min_size) {
        size = max_size;
        max_size = min_size;
        min_size = size;
    }

    pthread_mutex_lock(&buffer_lock);
    while (1) {
        /* Buffer empty? */
        if (buffer_next >= buffer_ends) {
            /* If no more data is incoming either, we are done. */
            if (buffer_done) {
                pthread_mutex_unlock(&buffer_lock);
                return have;
            }
            pthread_cond_wait(&buffer_more, &buffer_lock);
            continue;
        }

        /* How much can we grab? */
        size = buffer_ends - buffer_next;
        if (have + size > max_size)
            size = max_size - have;

        memmove(data, buffer_data + buffer_next,
                size * sizeof buffer_data[0]);
        buffer_next += size;

        /* Update to reflect the data already grabbed */
        data += size;
        have += size;

        /* Wake up a waiter for empty room in the buffer. */
        pthread_cond_signal(&buffer_room);

        /* Enough data to return? */
        if (have >= min_size) {
            pthread_mutex_unlock(&buffer_lock);
            return have;
        }
    }
}
While this does copy the data around quite a bit, it allows both producers and consumers to work on their own data in any size "chunks" they wish.
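As a quick illustration (this usage sketch is mine, not part of the code above; the 5000-element batch and the buffer size are arbitrary choices matching the question):
/* Usage sketch: one producer (the main thread) and one consumer. */
static void *consumer_main(void *arg)
{
    double chunk[5000];
    size_t n;

    (void)arg;
    /* Block until at least one value is available; take at most 5000. */
    while ((n = buffer_get(chunk, 1, 5000)) > 0) {
        /* ... process n values ... */
    }
    return NULL;
}

int main(void)
{
    pthread_t consumer;
    static double batch[5000];

    if (buffer_resize(16384) == -1)
        return 1;
    pthread_create(&consumer, NULL, consumer_main, NULL);

    buffer_add(batch, 5000);

    /* Signal end of data and wake any waiting consumers. */
    pthread_mutex_lock(&buffer_lock);
    buffer_done = 1;
    pthread_cond_broadcast(&buffer_more);
    pthread_mutex_unlock(&buffer_lock);

    pthread_join(consumer, NULL);
    return 0;
}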
If your producers and consumers work on matrices, or other "packetized" data of some maximum size, it makes sense to use singly-linked lists of preallocated packets of data, and not a linear buffer:
struct data_packet {
    struct data_packet *next;
    size_t size;  /* Maximum size of data */
    size_t used;  /* Or rows, cols if a matrix */
    double data[];
};

struct data_queue {
    pthread_mutex_t lock;
    pthread_cond_t more;
    pthread_cond_t room;
    struct data_packet *queue;
    struct data_packet *unused;
    unsigned long produced; /* Optional, just information */
    unsigned long consumed; /* Optional, just information */
    volatile int done;      /* Set when no more to be produced */
};

static void free_data_packets(struct data_packet *root)
{
    while (root) {
        struct data_packet *curr = root;
        root = root->next;
        curr->next = NULL;
        curr->size = 0;
        free(curr);
    }
}
To initialize a data queue, we also need to generate some empty packets in it. This must be done before any threads start working with the queue:
/* Returns the count of data packets actually created,
   or 0 if an error occurs (with errno set). */
size_t data_queue_init(struct data_queue *q,
                       const size_t size,
                       const size_t count)
{
    if (!q) {
        errno = EINVAL;
        return 0;
    }

    pthread_mutex_init(&(q->lock), NULL);
    pthread_cond_init(&(q->more), NULL);
    pthread_cond_init(&(q->room), NULL);
    q->queue = NULL;
    q->unused = NULL;
    q->produced = 0;
    q->consumed = 0;
    q->done = 0;

    /* Makes no sense to request no data packets. */
    if (count < 1) {
        errno = EINVAL;
        return 0;
    }

    /* Create a chain of empty packets of desired size. */
    struct data_packet *curr, *unused = NULL;
    size_t have = 0;

    while (have < count) {
        curr = malloc(sizeof (struct data_packet)
                      + size * sizeof curr->data[0]);
        if (!curr)
            break;
        curr->next = unused;
        curr->size = size;
        curr->used = 0;
        unused = curr;
        have++;
    }

    if (!have) {
        errno = ENOMEM;
        return 0;
    }

    /* Attach chain to data queue; done. */
    q->unused = unused;
    return have;
}
Producers grab a free packet from the data queue:
struct data_packet *data_queue_get_unused(struct data_queue *q)
{
    /* Safety check. */
    if (!q) {
        errno = EINVAL;
        return NULL;
    }

    pthread_mutex_lock(&(q->lock));
    while (!q->done) {
        struct data_packet *curr = q->unused;

        /* No unused data packets free? */
        if (!curr) {
            pthread_cond_wait(&(q->room), &(q->lock));
            continue;
        }

        /* Detach and clear. */
        q->unused = curr->next;
        curr->next = NULL;
        curr->used = 0;

        /* Successful. */
        pthread_mutex_unlock(&(q->lock));
        return curr;
    }

    /* Done is set. */
    pthread_mutex_unlock(&(q->lock));
    errno = 0;
    return NULL;
}
The above may return NULL, when an error occurs (errno will be set to a nonzero error), or when the done flag is set (errno will be zero).
The producer must remember to set the used field to reflect the amount of data it produced in the packet. (It must not exceed size, though.)
The producer can work on the data packet as they wish; it is their "own", and no locking is needed.
When the producer has completed the packet, they append it to the data queue:
int data_queue_append(struct data_queue *q, struct data_packet *p)
{
    /* Safety check. */
    if (!q || !p) {
        errno = EINVAL;
        return -1;
    }

    p->next = NULL;

    pthread_mutex_lock(&(q->lock));

    /* Append to queue. */
    struct data_packet *prev = q->queue;
    if (!prev) {
        q->queue = p;
    } else {
        while (prev->next)
            prev = prev->next;
        prev->next = p;
    }
    q->produced++;

    /* Wake up a waiter for a new packet. */
    pthread_cond_signal(&(q->more));

    /* Done. */
    pthread_mutex_unlock(&(q->lock));
    return 0;
}
Similarly, a consumer grabs the next packet from the queue,
struct data_packet *data_queue_get(struct data_queue *q)
{
    /* Safety check. */
    if (!q) {
        errno = EINVAL;
        return NULL;
    }

    pthread_mutex_lock(&(q->lock));
    while (1) {
        struct data_packet *curr = q->queue;

        /* No data produced yet? */
        if (!curr) {
            /* If the done flag is set, we're done. */
            if (q->done) {
                pthread_mutex_unlock(&(q->lock));
                errno = 0;
                return NULL;
            }
            /* Wait for a signal on 'more'. */
            pthread_cond_wait(&(q->more), &(q->lock));
            continue;
        }

        /* Detach and done. */
        q->queue = curr->next;
        curr->next = NULL;
        q->consumed++;

        pthread_mutex_unlock(&(q->lock));
        return curr;
    }
}
and freely works on it. Note that the above does not examine the done flag unless the queue is empty.
When it has completed the work on the packet, it returns it to the unused queue:
int data_queue_append_unused(struct data_queue *q, struct data_packet *p)
{
    /* Safety check */
    if (!q || !p) {
        errno = EINVAL;
        return -1;
    }

    /* Clear it. */
    p->used = 0;

    pthread_mutex_lock(&(q->lock));

    /* Prepend to unused queue. */
    p->next = q->unused;
    q->unused = p;

    /* Signal a waiter that a new packet is available. */
    pthread_cond_signal(&(q->room));

    /* Done. */
    pthread_mutex_unlock(&(q->lock));
    return 0;
}
This approach allows one or more consumers and one or more producers to work on their own packets at their own pace, without using any locks et cetera, and without copying the data itself around. However, the packet size and the number of packets concurrently being worked on are limited.
The queue must be initialized with an unused packet count of at least the total number of producers and consumers; I prefer about twice that, to maximize throughput when the time taken by each varies a bit. The above does, however, allow removal of empty packets from the unused queue, and/or appending new empty packets to the unused queue, at any point in time. (When appending, remember to signal on the data queue room condition variable.)
Finally, note that the produced and consumed counts refer to the queue itself. If you want consumed to reflect the number of packets already consumed, you can move the q->consumed++ from data_queue_get() to data_queue_append_unused() instead.
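For completeness, a usage sketch under the sizing advice above (the thread functions and the four-packet choice are illustrative assumptions of mine):
/* Usage sketch: producer fills packets, consumer drains them. */
static struct data_queue q;

static void *producer_main(void *arg)
{
    struct data_packet *p;

    (void)arg;
    /* Returns NULL once q.done is set. */
    while ((p = data_queue_get_unused(&q)) != NULL) {
        /* ... fill p->data[0 .. p->size-1] ... */
        p->used = p->size;
        data_queue_append(&q, p);
    }
    return NULL;
}

static void *consumer_main(void *arg)
{
    struct data_packet *p;

    (void)arg;
    while ((p = data_queue_get(&q)) != NULL) {
        /* ... consume p->data[0 .. p->used-1] ... */
        data_queue_append_unused(&q, p);
    }
    return NULL;
}

/* Initialization, e.g. in main(): four packets of 5000 doubles,
   about twice the number of threads, per the advice above. */
/* data_queue_init(&q, 5000, 4); */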
It will work, but be aware that the absolute maximum message size is 16 MB (HARD_MSGSIZEMAX) since Linux 3.5, and was 1 MB before that. The default message size limit is only 8 KB, though, so you need to set it when you call mq_open(), or your 5000 doubles won't fit in one message.
A message queue is meant to transfer data between processes. Since threads are part of the same process, there is no need to send data first to the kernel and then receive it back. In the case of threads, all global data is visible to all threads. Signalling mechanisms like mutexes and condition variables are required to synchronize the availability of data between threads.
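For example, a minimal sketch of setting the sizes at mq_open() time (the queue name and depth are arbitrary; an unprivileged process may also need /proc/sys/fs/mqueue/msgsize_max raised above the 8 KB default):
#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>

int main(void)
{
    struct mq_attr attr = { 0 };
    attr.mq_maxmsg = 4;                      /* arbitrary queue depth */
    attr.mq_msgsize = 5000 * sizeof(double); /* 40000 bytes > 8 KB default */

    mqd_t mq = mq_open("/myqueue", O_CREAT | O_RDWR, 0600, &attr);
    if (mq == (mqd_t)-1) {
        perror("mq_open");
        return 1;
    }
    mq_close(mq);
    return 0;
}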

C malloc use case - realloc versus pre-computation

I want to create an array of structs on the heap from another data structure. Say there are N total elements to traverse, and (N-x) pointers (computed_elements) will be added to the array.
My naive strategy for this is to create an array (temp_array) of size N on the stack and traverse the data structure, keeping track of how many elements need to be added and appending them to temp_array as I encounter them. Once I've finished, I malloc(computed_elements) and populate that array from temp_array.
This is suboptimal because the second loop is unnecessary. However, I am weighing this against the tradeoff of constantly reallocating memory every iteration. Some rough code to clarify:
void *temp_array[N];
int count = 0;

for (int i = 0; i < N; i++) {
    if (check(arr[i])) {
        temp_array[count] = arr[i];
        count++;
    }
}

void **results = malloc(count * sizeof(*results));
for (int i = 0; i < count; i++) {
    results[i] = temp_array[i];
}
return results;
Thoughts would be appreciated.
One common strategy is to try to estimate the number of elements you're going to need (not a close estimate, more of a "On the order of..." type estimate). Malloc that amount of memory, and when you get "close" to that limit ("close" also being up for interpretation), realloc some more. Personally, I typically double the array when I get close to filling it.
-EDIT-
Here is the "ten minute version". (I've ensured that it builds and doesn't segfault)
Obviously I've omitted things like checking for the success of malloc/realloc, zeroing memory, etc...
#include <stdlib.h>
#include <stdbool.h>
#include <string.h> /* for the "malloc only version" (see below) */

/* Assume 500 elements estimated */
#define ESTIMATED_NUMBER_OF_RECORDS 500

/* "MAX" number of elements that the original question seems to be bound by */
#define N 10000

/* Included only to allow for test compilation */
typedef struct
{
    int foo;
    int bar;
} MyStruct;

/* Included only to allow for test compilation */
MyStruct arr[N] = { 0 };

/* Included only to allow for test compilation */
bool check(MyStruct valueToCheck)
{
    bool baz = true;
    /* ... */
    return baz;
}

int main (int argc, char** argv)
{
    int idx = 0;
    int actualRecordCount = 0;
    int allocatedSize = 0;
    MyStruct *tempPointer = NULL;
    MyStruct *results = malloc(ESTIMATED_NUMBER_OF_RECORDS * sizeof(MyStruct));

    allocatedSize = ESTIMATED_NUMBER_OF_RECORDS;

    for (idx = 0; idx < N; idx++)
    {
        /* Ensure that we're not about to walk off the current array */
        if (actualRecordCount == allocatedSize)
        {
            allocatedSize *= 2;

            /* "malloc only version"
             * If you want to avoid realloc and just malloc everything...
             */
            /*
            tempPointer = malloc(allocatedSize * sizeof(MyStruct));
            memcpy(tempPointer, results, actualRecordCount * sizeof(MyStruct));
            free(results);
            results = tempPointer;
            */

            /* Using realloc... */
            tempPointer = realloc(results, allocatedSize * sizeof(MyStruct));
            results = tempPointer;
        }

        /* Check validity of original array element */
        if (check(arr[idx]))
        {
            results[actualRecordCount] = arr[idx];
            actualRecordCount++;
        }
    }

    if (results != NULL)
    {
        free(results);
    }

    return 0;
}
One possibility is malloc for the size N, then run your loop, then realloc for size N-x. Memory fragmentation may result for small x.
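A sketch of that approach, reusing the question's N, arr, check() and MyStruct (error handling kept minimal):
/* Sketch of the malloc-then-shrink approach. */
MyStruct *collect(void)
{
    MyStruct *results = malloc(N * sizeof *results);
    size_t count = 0;

    if (results == NULL)
        return NULL;

    for (size_t i = 0; i < N; i++) {
        if (check(arr[i]))
            results[count++] = arr[i];
    }

    /* Give back the unused tail; keep the old block if realloc fails. */
    if (count > 0) {
        MyStruct *shrunk = realloc(results, count * sizeof *shrunk);
        if (shrunk != NULL)
            results = shrunk;
    }
    return results;
}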
The best re-usable code is very often scalable. Unless obligated to use small sizes, assume code will grow in subsequent applications and will need to be reasonably efficient for large N. You do want to re-use good code.
realloc() an array by a factor of 4 or so as the need arises. Arrays may also shrink - no need to have a bloated array lying around. I've used growth by a factor of 4 at intervals 1, 4, 16, 64, ... and shrink intervals at 2, 8, 32, 128, ... By keeping the grow and shrink thresholds apart from each other, it avoids lots of churn should N waver around an interval.
Even in small ways, like using size_t vs. int. Sure, with sizes like 1000 it makes no difference, but with code re-use, an application may push the limit: size_t is better for array indexes.
/* instead of this ... */
void *temp_array[N];
for (int i = 0; i < N; i++) {

/* ... prefer this */
void *temp_array[N];
for (size_t i = 0; i < N; i++) {

Identifying the start and end of functions in a C program

I am currently programming in C to find the complexity of functions in a program, based on the number of lines in the functions. I will have to fopen() an existing C file and proceed with the calculation. I know that there may be some built-in tools for finding it, but I still want to program it manually. Is there any specific method to find the start and end of the various functions in a C file?
Run this through the C preprocessor first. This way you strip comments, unroll macros, pull in #includes, etc. Unless you want the complexity of the user-readable code, this will produce much truer results.
Remove fixed strings. Anything between "" goes; note that an escaped quote \" doesn't close the string.
Scan the file. The first { increases the count of functions and begins scanning the body of a function. Observe the depth: { increases depth, } decreases it, and when depth reaches 0, that } is the end of the function. The next { will be a new function, but while you scan the outside, if before reaching the next { or EOF you encounter a ; - cancel any data collected on the last piece. That wasn't a function; it was a struct, a union or something like that. A sketch of this scanning pass follows below.
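A minimal sketch of that pass (my own illustration; it assumes comments and string/character literals have already been stripped as described, and it is only as reliable as the heuristic above):
#include <stdio.h>

/* Sketch of the brace-scanning pass described above. */
static void scan(FILE *f)
{
    int c, depth = 0, functions = 0, pending = 0;
    long start_line = 0, end_line = 0, line = 1;

    while ((c = getc(f)) != EOF) {
        if (c == '\n') {
            line++;
        } else if (c == '{') {
            if (depth == 0) {
                /* the previous {...} survived with no trailing ';',
                   so it really was a function body */
                if (pending) {
                    printf("function %d: lines %ld-%ld\n",
                           ++functions, start_line, end_line);
                    pending = 0;
                }
                start_line = line;
            }
            depth++;
        } else if (c == '}') {
            if (depth > 0 && --depth == 0) {
                end_line = line;
                pending = 1;    /* tentatively a function body */
            }
        } else if (c == ';' && depth == 0) {
            pending = 0;        /* struct, union, enum or initializer */
        }
    }
    if (pending)
        printf("function %d: lines %ld-%ld\n", ++functions,
               start_line, end_line);
}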
I would recommend a 2-pass approach.
Pass 1: Remove any open or close braces inside comments (and optionally those in preprocessor directives).
Pass 2: Count open and close braces and whenever they match up (#open == #close) a function ends. The next open brace denotes the start of a new function.
This approach is not fail-safe. It may fail if the code contains preprocessor statements that violate good programming practice. If you encounter such code you may want to run your tool on the code after it has passed through the preprocessor stage.
I finally found a nice way to do this!
doxygen already does a lot of work to process functions and other constructs nicely.
Generate a doxygen config file, e.g. doxygen -g doxygen_conf.
Open the config file with your favorite editor and set GENERATE_XML = YES and INPUT = [PATH_TO_PROJECT_BASE]. You might also want to set RECURSIVE = YES and whatever else your project needs. Then run doxygen.
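The relevant settings in doxygen_conf would then look something like this (the INPUT path is the document's own placeholder for your project base):
INPUT        = [PATH_TO_PROJECT_BASE]
RECURSIVE    = YES
GENERATE_XML = YES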
In your doxygen build directory, you will find html/ and xml/.
cd80@cd80 ~/lab/VulnVizOnLinux/linux-5.4.109 » cd build_doc
cd80@cd80 ~/lab/VulnVizOnLinux/linux-5.4.109/build_doc » ls
ExtractFunctions.ipynb html xml
cd80@cd80 ~/lab/VulnVizOnLinux/linux-5.4.109/build_doc »
(ignore ExtractFunctions.ipynb, that's mine)
cd to xml/, open any of the XML files, and analyze it for a while.
Here's how I did it.
import os
import xml.etree.ElementTree as ET

base_path = '/home/cd80/lab/VulnVizOnLinux/linux-5.4.109/'
open_files = {}

doc = ET.parse('/home/cd80/lab/VulnVizOnLinux/linux-5.4.109/build_doc/xml/4_2kernel_2module-plts_8c.xml')
root = doc.getroot()

for func in root.findall(".//memberdef[@kind='function']"):
    name = func.find('./name').text
    location = func.find('./location')
    if 'bodyend' not in location.keys():
        continue  # this memberdef is not a definition of a function
    bodystart = int(location.attrib.get('bodystart'))
    bodyend = int(location.attrib.get('bodyend'))
    file_path = location.attrib.get('file')
    file_path = os.path.join(base_path, file_path)
    if file_path not in open_files.keys():
        with open(file_path, 'rb') as f:
            code = f.read().decode('utf-8')
        open_files[file_path] = code
    else:
        code = open_files[file_path]
    func_def = '\n'.join(code.split("\n")[bodystart-1:bodyend])
    print(func_def)
    print('='*30)
Result:
static struct plt_entry __get_adrp_add_pair(u64 dst, u64 pc,
                                            enum aarch64_insn_register reg)
{
    u32 adrp, add;

    adrp = aarch64_insn_gen_adr(pc, dst, reg, AARCH64_INSN_ADR_TYPE_ADRP);
    add = aarch64_insn_gen_add_sub_imm(reg, reg, dst % SZ_4K,
                                       AARCH64_INSN_VARIANT_64BIT,
                                       AARCH64_INSN_ADSB_ADD);

    return (struct plt_entry){ cpu_to_le32(adrp), cpu_to_le32(add) };
}
==============================
struct plt_entry get_plt_entry(u64 dst, void *pc)
{
    struct plt_entry plt;
    static u32 br;

    if (!br)
        br = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_16,
                                         AARCH64_INSN_BRANCH_NOLINK);

    plt = __get_adrp_add_pair(dst, (u64)pc, AARCH64_INSN_REG_16);
    plt.br = cpu_to_le32(br);

    return plt;
}
==============================
bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b)
{
    u64 p, q;

    /*
     * Check whether both entries refer to the same target:
     * do the cheapest checks first.
     * If the 'add' or 'br' opcodes are different, then the target
     * cannot be the same.
     */
    if (a->add != b->add || a->br != b->br)
        return false;

    p = ALIGN_DOWN((u64)a, SZ_4K);
    q = ALIGN_DOWN((u64)b, SZ_4K);

    /*
     * If the 'adrp' opcodes are the same then we just need to check
     * that they refer to the same 4k region.
     */
    if (a->adrp == b->adrp && p == q)
        return true;

    return (p + aarch64_insn_adrp_get_offset(le32_to_cpu(a->adrp))) ==
           (q + aarch64_insn_adrp_get_offset(le32_to_cpu(b->adrp)));
}
==============================
static bool in_init(const struct module *mod, void *loc)
{
    return (u64)loc - (u64)mod->init_layout.base < mod->init_layout.size;
}
==============================
u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs,
                          void *loc, const Elf64_Rela *rela,
                          Elf64_Sym *sym)
{
    struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core :
                                                      &mod->arch.init;
    struct plt_entry *plt = (struct plt_entry *)sechdrs[pltsec->plt_shndx].sh_addr;
    int i = pltsec->plt_num_entries;
    int j = i - 1;
    u64 val = sym->st_value + rela->r_addend;

    if (is_forbidden_offset_for_adrp(&plt[i].adrp))
        i++;

    plt[i] = get_plt_entry(val, &plt[i]);

    /*
     * Check if the entry we just created is a duplicate. Given that the
     * relocations are sorted, this will be the last entry we allocated.
     * (if one exists).
     */
    if (j >= 0 && plt_entries_equal(plt + i, plt + j))
        return (u64)&plt[j];

    pltsec->plt_num_entries += i - j;
    if (WARN_ON(pltsec->plt_num_entries > pltsec->plt_max_entries))
        return 0;

    return (u64)&plt[i];
}
==============================
static int cmp_rela(const void *a, const void *b)
{
    const Elf64_Rela *x = a, *y = b;
    int i;

    /* sort by type, symbol index and addend */
    i = cmp_3way(ELF64_R_TYPE(x->r_info), ELF64_R_TYPE(y->r_info));
    if (i == 0)
        i = cmp_3way(ELF64_R_SYM(x->r_info), ELF64_R_SYM(y->r_info));
    if (i == 0)
        i = cmp_3way(x->r_addend, y->r_addend);
    return i;
}
==============================
static bool duplicate_rel(const Elf64_Rela *rela, int num)
{
    /*
     * Entries are sorted by type, symbol index and addend. That means
     * that, if a duplicate entry exists, it must be in the preceding
     * slot.
     */
    return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0;
}
==============================
static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num,
                               Elf64_Word dstidx, Elf_Shdr *dstsec)
{
    unsigned int ret = 0;
    Elf64_Sym *s;
    int i;

    for (i = 0; i < num; i++) {
        u64 min_align;

        switch (ELF64_R_TYPE(rela[i].r_info)) {
        case R_AARCH64_JUMP26:
        case R_AARCH64_CALL26:
            if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE))
                break;

            /*
             * We only have to consider branch targets that resolve
             * to symbols that are defined in a different section.
             * This is not simply a heuristic, it is a fundamental
             * limitation, since there is no guaranteed way to emit
             * PLT entries sufficiently close to the branch if the
             * section size exceeds the range of a branch
             * instruction. So ignore relocations against defined
             * symbols if they live in the same section as the
             * relocation target.
             */
            s = syms + ELF64_R_SYM(rela[i].r_info);
            if (s->st_shndx == dstidx)
                break;

            /*
             * Jump relocations with non-zero addends against
             * undefined symbols are supported by the ELF spec, but
             * do not occur in practice (e.g., 'jump n bytes past
             * the entry point of undefined function symbol f').
             * So we need to support them, but there is no need to
             * take them into consideration when trying to optimize
             * this code. So let's only check for duplicates when
             * the addend is zero: this allows us to record the PLT
             * entry address in the symbol table itself, rather than
             * having to search the list for duplicates each time we
             * emit one.
             */
            if (rela[i].r_addend != 0 || !duplicate_rel(rela, i))
                ret++;
            break;
        case R_AARCH64_ADR_PREL_PG_HI21_NC:
        case R_AARCH64_ADR_PREL_PG_HI21:
            if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) ||
                !cpus_have_const_cap(ARM64_WORKAROUND_843419))
                break;

            /*
             * Determine the minimal safe alignment for this ADRP
             * instruction: the section alignment at which it is
             * guaranteed not to appear at a vulnerable offset.
             *
             * This comes down to finding the least significant zero
             * bit in bits [11:3] of the section offset, and
             * increasing the section's alignment so that the
             * resulting address of this instruction is guaranteed
             * to equal the offset in that particular bit (as well
             * as all less signficant bits). This ensures that the
             * address modulo 4 KB != 0xfff8 or 0xfffc (which would
             * have all ones in bits [11:3])
             */
            min_align = 2ULL << ffz(rela[i].r_offset | 0x7);

            /*
             * Allocate veneer space for each ADRP that may appear
             * at a vulnerable offset nonetheless. At relocation
             * time, some of these will remain unused since some
             * ADRP instructions can be patched to ADR instructions
             * instead.
             */
            if (min_align > SZ_4K)
                ret++;
            else
                dstsec->sh_addralign = max(dstsec->sh_addralign,
                                           min_align);
            break;
        }
    }

    if (IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) &&
        cpus_have_const_cap(ARM64_WORKAROUND_843419))
        /*
         * Add some slack so we can skip PLT slots that may trigger
         * the erratum due to the placement of the ADRP instruction.
         */
        ret += DIV_ROUND_UP(ret, (SZ_4K / sizeof(struct plt_entry)));

    return ret;
}
==============================
int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
                              char *secstrings, struct module *mod)
{
    unsigned long core_plts = 0;
    unsigned long init_plts = 0;
    Elf64_Sym *syms = NULL;
    Elf_Shdr *pltsec, *tramp = NULL;
    int i;

    /*
     * Find the empty .plt section so we can expand it to store the PLT
     * entries. Record the symtab address as well.
     */
    for (i = 0; i < ehdr->e_shnum; i++) {
        if (!strcmp(secstrings + sechdrs[i].sh_name, ".plt"))
            mod->arch.core.plt_shndx = i;
        else if (!strcmp(secstrings + sechdrs[i].sh_name, ".init.plt"))
            mod->arch.init.plt_shndx = i;
        else if (!strcmp(secstrings + sechdrs[i].sh_name,
                         ".text.ftrace_trampoline"))
            tramp = sechdrs + i;
        else if (sechdrs[i].sh_type == SHT_SYMTAB)
            syms = (Elf64_Sym *)sechdrs[i].sh_addr;
    }

    if (!mod->arch.core.plt_shndx || !mod->arch.init.plt_shndx) {
        pr_err("%s: module PLT section(s) missing\n", mod->name);
        return -ENOEXEC;
    }
    if (!syms) {
        pr_err("%s: module symtab section missing\n", mod->name);
        return -ENOEXEC;
    }

    for (i = 0; i < ehdr->e_shnum; i++) {
        Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset;
        int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela);
        Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info;

        if (sechdrs[i].sh_type != SHT_RELA)
            continue;

        /* ignore relocations that operate on non-exec sections */
        if (!(dstsec->sh_flags & SHF_EXECINSTR))
            continue;

        /* sort by type, symbol index and addend */
        sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL);

        if (!str_has_prefix(secstrings + dstsec->sh_name, ".init"))
            core_plts += count_plts(syms, rels, numrels,
                                    sechdrs[i].sh_info, dstsec);
        else
            init_plts += count_plts(syms, rels, numrels,
                                    sechdrs[i].sh_info, dstsec);
    }

    pltsec = sechdrs + mod->arch.core.plt_shndx;
    pltsec->sh_type = SHT_NOBITS;
    pltsec->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
    pltsec->sh_addralign = L1_CACHE_BYTES;
    pltsec->sh_size = (core_plts + 1) * sizeof(struct plt_entry);
    mod->arch.core.plt_num_entries = 0;
    mod->arch.core.plt_max_entries = core_plts;

    pltsec = sechdrs + mod->arch.init.plt_shndx;
    pltsec->sh_type = SHT_NOBITS;
    pltsec->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
    pltsec->sh_addralign = L1_CACHE_BYTES;
    pltsec->sh_size = (init_plts + 1) * sizeof(struct plt_entry);
    mod->arch.init.plt_num_entries = 0;
    mod->arch.init.plt_max_entries = init_plts;

    if (tramp) {
        tramp->sh_type = SHT_NOBITS;
        tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
        tramp->sh_addralign = __alignof__(struct plt_entry);
        tramp->sh_size = sizeof(struct plt_entry);
    }

    return 0;
}
==============================
Dirty, but it works just as I wanted.

Markov Clean Function Trouble

I am trying to write a function to clean up the hash table that is generated by this code
/*
 * Markov chain random text generator.
 */
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "eprintf.h"

enum {
    NPREF = 2,      /* number of prefix words */
    NHASH = 4093,   /* size of state hash table array */
    MAXGEN = 10000  /* maximum words generated */
};

typedef struct State State;
typedef struct Suffix Suffix;

struct State {          /* prefix + suffix list */
    char* pref[NPREF];  /* prefix words */
    Suffix* suf;        /* list of suffixes */
    State* next;        /* next in hash table */
};

struct Suffix {     /* list of suffixes */
    char* word;     /* suffix */
    Suffix* next;   /* next in list of suffixes */
};

State* lookup(char *prefix[], int create);
void build(char *prefix[], FILE*);
void generate(int nwords);
void add(char *prefix[], char *word);

State* statetab[NHASH]; /* hash table of states */
char NONWORD[] = "\n";  /* cannot appear as real word */

/* markov main: markov-chain random text generation */
int main(void)
{
    int i, nwords = MAXGEN;
    char *prefix[NPREF]; /* current input prefix */
    long seed;

    setProgName("markov");
    seed = time(NULL);
    srand(seed);
    for (i = 0; i < NPREF; i++) /* set up initial prefix */
        prefix[i] = NONWORD;
    build(prefix, stdin);
    add(prefix, NONWORD);
    generate(nwords);
    return 0;
}

const int MULTIPLIER = 31; /* for hash() */

/* hash: compute hash value for array of NPREF strings */
unsigned int hash(char* s[NPREF])
{
    unsigned int h;
    unsigned char *p;
    int i;

    h = 0;
    for (i = 0; i < NPREF; i++)
        for (p = (unsigned char *) s[i]; *p != '\0'; p++)
            h = MULTIPLIER * h + *p;
    return h % NHASH;
}

/* lookup: search for prefix; create if requested. */
/* returns pointer if present or created; NULL if not. */
/* creation doesn't strdup so strings mustn't change later. */
State* lookup(char *prefix[NPREF], int create)
{
    int i, h;
    State *sp;

    h = hash(prefix);
    for (sp = statetab[h]; sp != NULL; sp = sp->next) {
        for (i = 0; i < NPREF; i++)
            if (strcmp(prefix[i], sp->pref[i]) != 0)
                break;
        if (i == NPREF) /* found it */
            return sp;
    }
    if (create) {
        sp = (State *) emalloc(sizeof(State));
        for (i = 0; i < NPREF; i++)
            sp->pref[i] = prefix[i];
        sp->suf = NULL;
        sp->next = statetab[h];
        statetab[h] = sp;
    }
    return sp;
}

/* addsuffix: add to state. suffix must not change later */
void addsuffix(State *sp, char *suffix)
{
    Suffix *suf;

    suf = (Suffix *) emalloc(sizeof(Suffix));
    suf->word = suffix;
    suf->next = sp->suf;
    sp->suf = suf;
}

/* add: add word to suffix list, update prefix */
void add(char *prefix[NPREF], char *suffix)
{
    State *sp;

    sp = lookup(prefix, 1); /* create if not found */
    addsuffix(sp, suffix);
    /* move the words down the prefix */
    memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));
    prefix[NPREF-1] = suffix;
}

/* build: read input, build prefix table */
void build(char *prefix[NPREF], FILE *f)
{
    char buf[100], fmt[10];

    /* create a format string; %s could overflow buf */
    sprintf(fmt, "%%%ds", (int)sizeof(buf)-1);
    while (fscanf(f, fmt, buf) != EOF)
        add(prefix, estrdup(buf));
}

/* generate: produce output, one word per line */
void generate(int nwords)
{
    State *sp;
    Suffix *suf;
    char *prefix[NPREF], *w;
    int i, nmatch;

    for (i = 0; i < NPREF; i++) /* reset initial prefix */
        prefix[i] = NONWORD;

    for (i = 0; i < nwords; i++) {
        sp = lookup(prefix, 0);
        if (sp == NULL)
            eprintf("internal error: lookup failed");
        nmatch = 0;
        for (suf = sp->suf; suf != NULL; suf = suf->next)
            if (rand() % ++nmatch == 0) /* prob = 1/nmatch */
                w = suf->word;
        if (nmatch == 0)
            eprintf("internal error: no suffix %d %s", i, prefix[0]);
        if (strcmp(w, NONWORD) == 0)
            break;
        printf("%s\n", w);
        memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));
        prefix[NPREF-1] = w;
    }
}
Here is what I have so far for my clean function
/* Clean Function */
void clean_up(State *sp)
{
    State *temp;
    Suffix *temp2, temp3;

    for (int h = 0; h < NHASH; h++)
    {
        for (sp = statetab[h]; sp != NULL; sp = sp->next)
        {
            while (sp->suf != NULL)
            {
                temp2 = sp->suf;
                temp3 = *temp2->next;
                free(temp2);
                sp->suf = &temp3;
            }
        }
    }
}
I think I'm on the right track: I'm going through each index in the hash table, then going from state to state and freeing the suffixes. I'm not sure what to do about the prefixes, because I have to free them before I can free each state. Any help would be greatly appreciated.
In your code, you are copying into a temp3 node, which lives in automatic memory ("on the stack"). Pointing sp->suf at this memory will (on the next iteration of the loop) cause free() to be called with the address of this object, which has not been obtained from malloc() and thus cannot be freed by free().
void clean_up(State *sp)
{
    Suffix *temp2, **pp;

    for (int h = 0; h < NHASH; h++)
    {
        for (sp = statetab[h]; sp != NULL; sp = sp->next)
        {
            /* walk the suffix list, always freeing through sp->suf */
            for (pp = &sp->suf; *pp; *pp = temp2)
            {
                temp2 = (*pp)->next;
                free(*pp);
            }
        }
    }
}
The example code is derived from the Markov program in The Practice of Programming by Kernighan and Pike, a most excellent book.
Given that you are trying to clean up the statetab, the main clean-up function doesn't need any argument. You do have to be careful not to free the states directly in statetab, but you do need to release auxiliary states chained off statetab[i].next.
typedef struct State State;
typedef struct Suffix Suffix;

struct State {          /* prefix + suffix list */
    char* pref[NPREF];  /* prefix words */
    Suffix* suf;        /* list of suffixes */
    State* next;        /* next in hash table */
};

struct Suffix {     /* list of suffixes */
    char* word;     /* suffix */
    Suffix* next;   /* next in list of suffixes */
};

State* statetab[NHASH]; /* hash table of states */

static void free_state(State *state);
static void free_suffix(Suffix *suffix);

static void cleanup(void)
{
    for (int i = 0; i < NHASH; i++)
        free_state(statetab[i]);
}

static void free_state(State *state)
{
    if (state != 0)
    {
        for (int i = 0; i < NPREF; i++)
            free(state->pref[i]);
        free_suffix(state->suf);
        if (state->next != 0)
        {
            free_state(state->next);
            free(state->next);
        }
    }
}

static void free_suffix(Suffix *suffix)
{
    if (suffix != 0)
    {
        free(suffix->word);
        free_suffix(suffix->next);
        free(suffix);
    }
}
Do you see how I've designed the free_xxxx() code based on the design of the xxxx structure?
Caveat Lector: uncompiled code, much less tested code.
I dug up the code from the TPOP site, and tried to apply it. I made some fixes to the freeing code above (syntax error fixed, the null checks in free_state() and free_suffix()), but the code as a whole was not designed to allow the data to be freed.
There are a couple of problems. First, a few of the prefixes are not allocated (NONWORD). It might be possible to avoid releasing those by testing whether a prefix is NONWORD, but that's nasty. It might be possible to allocate those prefixes too (replace NONWORD by estrdup(NONWORD)). I think there's another place, somewhere, that a non-allocated pointer is being stashed in a prefix in the state table; I'm getting crashes in malloc() complaining of 'freeing non-allocated memory' (which is distinct from 'double freeing allocated memory', I believe), but I've not managed to resolve that.
However, that then leads to another problem: the prefixes are reused. That is, almost every word in the system is used as the second word of one prefix, then as the first word of the next prefix. Thus, you can't readily free the prefixes.
If you were to design this so that the memory could be released, then you'd probably design it so that there was a system of 'atoms' (immutable strings) such that each word was allocated once and reused as often as necessary (see C Interfaces and Implementations: Techniques for Creating Reusable Code by D Hanson for the source of the term). The code freeing the state table would then concentrate only on the non-word data. There'd be code to release the complete set of atoms as well.
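To make that concrete, here is a rough sketch of such an atom table (my own illustration of the idea, not Hanson's actual API; the hashing and table size are arbitrary):
/* Sketch of a string-atom table: each distinct word is stored
   exactly once, and all of them can be released in one sweep. */
#include <stdlib.h>
#include <string.h>

#define ATOM_BUCKETS 1024

struct atom {
    struct atom *next;
    char str[];
};

static struct atom *atoms[ATOM_BUCKETS];

/* intern: return the unique copy of s, allocating it on first use */
const char *intern(const char *s)
{
    unsigned h = 0;
    for (const char *p = s; *p; p++)
        h = 31 * h + (unsigned char)*p;
    h %= ATOM_BUCKETS;

    for (struct atom *a = atoms[h]; a; a = a->next)
        if (strcmp(a->str, s) == 0)
            return a->str;

    struct atom *a = malloc(sizeof *a + strlen(s) + 1);
    if (a == NULL)
        return NULL; /* out of memory */
    strcpy(a->str, s);
    a->next = atoms[h];
    atoms[h] = a;
    return a->str;
}

/* free_atoms: release every word in one sweep */
void free_atoms(void)
{
    for (int i = 0; i < ATOM_BUCKETS; i++) {
        while (atoms[i]) {
            struct atom *a = atoms[i];
            atoms[i] = a->next;
            free(a);
        }
    }
}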
I ran the Markov program under valgrind without the cleanup; there are no memory access problems and no leaked data; it is all still accessible at program exit. I was using a data file of about 15,000 words (and about 2900 distinct words), and the statistics were:
==9610== HEAP SUMMARY:
==9610== in use at exit: 695,269 bytes in 39,567 blocks
==9610== total heap usage: 39,567 allocs, 0 frees, 695,269 bytes allocated
==9610==
==9610== LEAK SUMMARY:
==9610== definitely lost: 0 bytes in 0 blocks
==9610== indirectly lost: 0 bytes in 0 blocks
==9610== possibly lost: 0 bytes in 0 blocks
==9610== still reachable: 695,269 bytes in 39,567 blocks
So, you set yourself an interesting exercise. However, I think it is not achievable without reworking some of the memory allocation mechanism so that the data can be freed cleanly.
(On BSD, and hence on Mac OS X too, there are a pair of functions in <stdlib.h> called setprogname() and getprogname(). On BSD, setprogname() is called automatically before the main() gets going (with argv[0], I believe). The declaration in eprintf.h conflicts with the declaration in <stdlib.h>, which may be why the code in the question uses setProgName() instead of the original setprogname(). I chose to fix setprogname() in eprintf.h so that it took a const char * argument and therefore matched the declaration in <stdlib.h>.)
TPOP was previously at
http://plan9.bell-labs.com/cm/cs/tpop and
http://cm.bell-labs.com/cm/cs/tpop but both are now (2015-08-10) broken.
See also Wikipedia on TPOP.

Implementing LRU algorithm

I am trying to implement the LRU algorithm, however I am really not sure how to change the FIFO to LRU and how to manipulate the page table.
// The declared parameters are as follows:
pt_entry pte[MAX_PAGE];  /* page table */
int mem_size;            /* physical memory size in page frames */
list free_list_head;     /* free list */
list res_set_head;       /* resident set */
int total_fault = 0;     /* total number of page faults */
int total_ref = 0;       /* total number of memory references */

unsigned short find_victim()
{
    unsigned short frame;
    list current;

    /* evict the frame at the head of the resident set (FIFO order) */
    frame = res_set_head->next->frame;
    invalidate(frame);
    current = res_set_head->next;
    res_set_head->next = current->next;
    res_set_head->next->prev = res_set_head;
    to_resident_set(current);
    return frame;
}
//-----------------------------------------------------------------
void resolve(int address)
{
    unsigned short frame_alloc;
    int virt_page;
    static int disp_counter = 0;

    virt_page = address >> 8;
    if (pte[virt_page].valid == 1)
    {
        /* page hit: for LRU, this is where the page's recency
           would need to be updated */
    }
    else
    {
        frame_alloc = find_frame();
        pte[virt_page].valid = 1;
        pte[virt_page].frame = frame_alloc;
        total_fault++;
    }
}
One way to implement LRU is to change your resolve() routine to unconditionally move pte[virt_page].frame to the bottom of the resident set list, so that it will be last to be evicted when a new frame is needed.
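For instance (a sketch only, assuming res_set_head is a circular doubly linked list with a sentinel node, as the unlinking in find_victim() suggests; find_node() is a hypothetical lookup helper):
/* Sketch: move the node for 'frame' to the tail of the resident
   set, so the head is always the least recently used page. */
static void touch_frame(unsigned short frame)
{
    list node = find_node(res_set_head, frame); /* hypothetical lookup */
    list tail;

    /* unlink the node from its current position */
    node->prev->next = node->next;
    node->next->prev = node->prev;

    /* re-link it at the tail (just before the sentinel) */
    tail = res_set_head->prev;
    tail->next = node;
    node->prev = tail;
    node->next = res_set_head;
    res_set_head->prev = node;
}

/* In resolve(), call touch_frame(pte[virt_page].frame) on a hit,
   so find_victim() keeps evicting from the head as before. */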
