First, look at my code (very simple, just to test)
struct task_struct *test_task;
int test_func(void *arg)
{
uint64_t cur = 0; //current time
uint64_t last = 0; //last time
while(!kthread_should_stop())
{
cur = get_cycle();
if (cur - last > 2410000000)
{
printk(KERN_INFO "this time: %llu\n", cur);
last = cur;
}
}
return 0;
}
void run_thread(void)
{
struct sched_param param;
param.sched_priority = 1;
test_task = kthread_create(&test_func, NULL, "MY_test");
kthread_bind(test_task, 2); //just use cpu 2
sched_setscheduler(test_task, SCHED_RR, ¶m);
wake_up_process(test_task);
printk(KERN_INFO"OK, Kernel Thread : %s, Running\n", test_task->comm);
}
static int __init test_init(void)
{
run_thread();
return 0;
}
static void __exit test_exit(void)
{
kthread_stop(test_task);
}
module_init(test_init);
module_exit(test_exit);
These are all code, The problem occurs when I remove this module, the system would crash.
I think it should be a problem on my understanding of the core processes, please help gets advice, thank you!
Please forgive me for this chinese english....
Related
I had a problem initializing the PMU in gem5 for an arm full system with the starter_fs.py in --cpu hpi.
i followed the instructions of this post Using perf_event with the ARM PMU inside gem5 and i managed to solve my problem. I added the patch and configure the system. I am not using perf. I try to access directly the registers and read them. As i see GEM5 has only some register events implemented. Can we add the others as well as :
for example EXC_TAKEN is not implemented. Is the following the way to add them?
self.addEvent(ProbeEvent(self,0x09, cpu, "EXC_TAKEN"))
#0x09: EXC_TAKEN ???
Also, reading the pmu event registers i manage to read them and extract the events but the pmccntr cycle register always returns zero? How gem5 increments this register? What are the steps to read the cycle reggister?
a code that i use to read using perf is the following:
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#define NUM_NODES 100
#define NONE 9999
struct _NODE
{
int iDist;
int iPrev;
};
typedef struct _NODE NODE;
struct _QITEM
{
int iNode;
int iDist;
int iPrev;
struct _QITEM *qNext;
};
typedef struct _QITEM QITEM;
QITEM *qHead = NULL;
int AdjMatrix[NUM_NODES][NUM_NODES];
int g_qCount = 0;
NODE rgnNodes[NUM_NODES];
int ch;
int iPrev, iNode;
int i, iCost, iDist;
void print_path (NODE *rgnNodes, int chNode)
{
if (rgnNodes[chNode].iPrev != NONE)
{
//print_path(rgnNodes, rgnNodes[chNode].iPrev);
}
//printf (" %d", chNode);
fflush(stdout);
}
void enqueue (int iNode, int iDist, int iPrev)
{
QITEM *qNew = (QITEM *) malloc(sizeof(QITEM));
QITEM *qLast = qHead;
if (!qNew)
{
//fprintf(stderr, "Out of memory.\n");
exit(1);
}
qNew->iNode = iNode;
qNew->iDist = iDist;
qNew->iPrev = iPrev;
qNew->qNext = NULL;
if (!qLast)
{
qHead = qNew;
}
else
{
while (qLast->qNext) qLast = qLast->qNext;
qLast->qNext = qNew;
}
g_qCount++;
// ASSERT(g_qCount);
}
void dequeue (int *piNode, int *piDist, int *piPrev)
{
QITEM *qKill = qHead;
if (qHead)
{
// ASSERT(g_qCount);
*piNode = qHead->iNode;
*piDist = qHead->iDist;
*piPrev = qHead->iPrev;
qHead = qHead->qNext;
free(qKill);
g_qCount--;
}
}
int qcount (void)
{
return(g_qCount);
}
int dijkstra(int chStart, int chEnd)
{
for (ch = 0; ch < NUM_NODES; ch++)
{
rgnNodes[ch].iDist = NONE;
rgnNodes[ch].iPrev = NONE;
}
if (chStart == chEnd)
{
//printf("Shortest path is 0 in cost. Just stay where you are.\n");
}
else
{
rgnNodes[chStart].iDist = 0;
rgnNodes[chStart].iPrev = NONE;
enqueue (chStart, 0, NONE);
while (qcount() > 0)
{
dequeue (&iNode, &iDist, &iPrev);
for (i = 0; i < NUM_NODES; i++)
{
if ((iCost = AdjMatrix[iNode][i]) != NONE)
{
if ((NONE == rgnNodes[i].iDist) ||
(rgnNodes[i].iDist > (iCost + iDist)))
{
rgnNodes[i].iDist = iDist + iCost;
rgnNodes[i].iPrev = iNode;
enqueue (i, iDist + iCost, iNode);
}
}
}
}
//printf("Shortest path is %d in cost. ", rgnNodes[chEnd].iDist);
//printf("Path is: ");
//print_path(rgnNodes, chEnd);
//printf("\n");
}
}
int main(int argc, char *argv[]) {
int diff = 0;
uint64_t num_cycles_nominal=0;
uint64_t num_cycles_attack=0;
uint64_t counter_cpu_cycles = 0;
//system("./load-module");
int i,j,k;
FILE *fp;
static int perf_fd_cpu_cycles;
static struct perf_event_attr attr_cpu_cycles;
attr_cpu_cycles.size = sizeof(attr_cpu_cycles);
attr_cpu_cycles.exclude_kernel = 1;
attr_cpu_cycles.exclude_hv = 1;
attr_cpu_cycles.exclude_callchain_kernel = 1;
attr_cpu_cycles.type = PERF_TYPE_RAW;
attr_cpu_cycles.config = 0x11;
/* Open the file descriptor corresponding to this counter. The counter
should start at this moment. */
if ((perf_fd_cpu_cycles = syscall(__NR_perf_event_open, &attr_cpu_cycles, 0, -1, -1, 0)) == -1)
fprintf(stderr, "perf_event_open fail %d %d: %s\n", perf_fd_cpu_cycles, errno, strerror(errno));
if (argc<2) {
//fprintf(stderr, "Usage: dijkstra <filename>\n");
//fprintf(stderr, "Only supports matrix size is #define'd.\n");
}
/* open the adjacency matrix file */
fp = fopen (argv[1],"r");
/* make a fully connected matrix */
for (i=0;i<NUM_NODES;i++) {
for (j=0;j<NUM_NODES;j++) {
/* make it more sparce */
fscanf(fp,"%d",&k);
AdjMatrix[i][j]= k;
}
}
/* Get and close the performance counters. */
read(perf_fd_cpu_cycles, &counter_cpu_cycles, sizeof(counter_cpu_cycles));
//close(perf_fd_cpu_cycles);
printf("Number of cpu_cycles before: %d\n", counter_cpu_cycles);
num_cycles_nominal = counter_cpu_cycles;
/* Get and close the performance counters. */
read(perf_fd_cpu_cycles, &counter_cpu_cycles, sizeof(counter_cpu_cycles));
//close(perf_fd_cpu_cycles);
printf("Number of cpu_cycles after attack: %d\n", counter_cpu_cycles);
num_cycles_attack = counter_cpu_cycles - num_cycles_nominal;
/* finds 10 shortest paths between nodes */
for (i=0,j=NUM_NODES/2;i<100;i++,j++) {
j=j%NUM_NODES;
dijkstra(i,j);
}
read(perf_fd_cpu_cycles, &counter_cpu_cycles, sizeof(counter_cpu_cycles));
close(perf_fd_cpu_cycles);
printf("Number of cpu_cycles end: %d\n", counter_cpu_cycles);
num_cycles_nominal = counter_cpu_cycles - num_cycles_attack;
printf("Number of cpu_cycles nominal: %d\n", num_cycles_nominal);
printf("Number of cpu_cycles attack: %d\n", num_cycles_attack);
exit(0);
}
the problem is that i can read the branch misses with perf having 0x10 instead 0f 0x11 (cycle counters RAW EVENT in GEM5) but using 0x11 for reading the cycles i get zero. When i try to reverse engineer the increment of cycle counter i do the following comments:
when simple/atomic or simple/timing i see that updateCycleCounter is called from the base.hh, also for the 03 cpu model. When HPI and considering that hpi is a MinorCPU model i see that updateCycleCounter is called only in POWER_STATE_ON, but i didnt find in the code a POWER_STATE_ON reference updateCycleCounter(CPU_STATE_ON) which will update the cycle counter. Please help me verify this assumption.
*****The problem was that in the MinorCPU the updateCycleCounter wasnt called for the CPU_STATE_ON which updates the ActiveCycles. It was fixed by the following patch https://gem5-review.googlesource.com/c/public/gem5/+/38095 .
I'm trying to build my own hypervisor for Linux (5.0.x kernel) on an Intel chip and I'm running into an odd problem.
Whenever I try to execute VMXON on more than one processor, it fails.
I made sure that VMX is enabled, that I allocated an aligned page and wrote VMCS REV ID into the VMXON region, but I'm not sure where the issue is.
This is my code:
vmx.c:
#include "vmx.h"
typedef struct vmstate {
bool vmx_enabled;
void *vmxon_region;
phys_addr_t vmxon_physical;
void *vmcs_region;
phys_addr_t vmcs_physical;
} vmstate;
static DEFINE_PER_CPU(vmstate*, cpu_vms);
static inline int VMXON(phys_addr_t phys){
uint8_t ret;
// TODO: Signal VMX to PT, to avoid PT crashes (Processor Trace)
__asm__ __volatile__ (
"vmxon %[pa]; setna %[ret]"
: [ret]"=rm"(ret)
: [pa]"m"(phys)
: "cc", "memory"
);
return ret;
}
static inline void VMXOFF(void){
__asm__ __volatile__("vmxoff" : : : "cc");
}
static void enable_vmx_operation_cr(void){
// Enable 14th bit in CR4
__write_cr4(__read_cr4() | 0x2000);
}
static void disable_vmx_operation_cr(void){
__write_cr4(__read_cr4() & ~(0x2000));
}
static vmstate *create_vmstate(void){
vmstate *vms = kzalloc(sizeof(vmstate), GFP_KERNEL);
vms->vmx_enabled = false;
return vms;
}
static void alloc_vmxon_region(vmstate *vms){
// TODO: respect physical width as set by the IA32_VMX_BASIC[48] bit for 32bit support
uint32_t vmcs_revid = 0;
uint32_t hi = 0;
void *vmxon_region = kmalloc(4096, GFP_KERNEL);
rdmsr_safe(MSR_IA32_VMX_BASIC, &vmcs_revid, &hi);
memcpy(vmxon_region, &vmcs_revid, 4);
vms->vmxon_region = vmxon_region;
vms->vmxon_physical = virt_to_phys(vmxon_region);
}
static void teardown_vmstate(vmstate *vms){
if(vms->vmxon_region)
kfree(vms->vmxon_region);
}
void vmx_teardown(void){
int i;
vmstate* vms;
for_each_possible_cpu(i){
vms = per_cpu(cpu_vms, i);
if(vms->vmx_enabled == true) {
VMXOFF();
vms->vmx_enabled = false;
}
disable_vmx_operation_cr();
teardown_vmstate(vms);
kfree(vms);
}
}
int vmx_setup(void){
int i;
vmstate* vms;
printk(KERN_INFO "NUM CPUS: %d", num_possible_cpus());
for_each_possible_cpu(i) {
// Allocate vmstate for every processor
per_cpu(cpu_vms, i) = create_vmstate();
vms = per_cpu(cpu_vms, i);
alloc_vmxon_region(vms);
enable_vmx_operation_cr();
if(VMXON(vms->vmxon_physical)){
printk(KERN_ALERT "VMXON operation failed!");
vms->vmx_enabled = false;
}
else
vms->vmx_enabled = true;
}
for_each_possible_cpu(i){
vms = per_cpu(cpu_vms, i);
if(vms->vmx_enabled == false) {
printk(KERN_ALERT "Tearing down after VMXON fail!");
vmx_teardown();
return -1;
}
}
return 0;
}
vmx_setup is called by a device open file operation:
static int hyper_dev_open(struct inode* inode, struct file *filep){
int err;
printk(KERN_INFO "Enabling VMX operation!\n");
if((err = vmx_setup()))
return err;
return 0;
}
When I execute VMXON on another processor, the carry flag is set to 1, zero flag is 0.
The driver works however if I add a VMXOFF() right after VMXON() so two VMX operations aren't enabled in parallel.
Any suggestions would help :)
for_each_possible_cpu simply iterates over the available CPUs; it doesn’t change execution to run on each CPU in turn. The entire loop runs on a single CPU.
Because of this, you are trying to execute vmxon repeatedly on the same CPU, which is why it is failing.
I've a function I wrote in order to run a given function on all processors. It works perfectly well in all cases except the following case:
When I try to use it within a kprobe that I registered.
Here's some code:
static DEFINE_MUTEX(entryMutex);
static struct kretprobe my_kprobe = {
.entry_handler = (kprobe_opcode_t *) NULL,
.handler = (kprobe_opcode_t *) process_entry_callback,
.maxactive = 1000,
.data_size = 0
};
static int driver_init(void)
{
my_kprobe.kp.addr = (kprobe_opcode_t*)kallsyms_lookup_name("sys_execve");
if ((ret = register_kretprobe(&my_kprobe)) < 0)
return -1;
return 0;
}
void foo(void* nothing)
{
printk("In foo\n");
}
static int process_entry_callback(struct kretprobe_instance* instance, struct pt_regs* regs)
{
mutex_lock(&entryMutex);
for(int i = 0; i < 4; ++i) // assumes there are 4 processors
run_func(foo, NULL, i);
mutex_unlock(&entryMutex);
return 0;
}
void run_func_wrap(struct function_data* data)
{
data->func(data->context);
wake_up_process(data->waiting_task);
*(data->condition) = TRUE;
}
void run_func(SCHEDULED_FUNC func, void *context, int processor)
{
struct function_data data;
struct task_struct* th;
BOOLEAN condition = FALSE;
wait_queue_head_t queue;
init_waitqueue_head(&queue);
data.func = func;
data.waiting_task = current;
data.context = context;
data.condition = &condition;
th = kthread_create(sched_func_wrap, &data, "th");
kthread_bind(th, processor);
wake_up_process(th);
wait_event(queue, condition);
}
F
After the call to 'run_func' in process_entry_callback I can no longer run any programs. Every time I start a new program it just stuck. After a while I get 'processor lockup' warning in the system log.
I suspect that it has something to do with the IRQ levels.
Any suggestions ?
EDIT:
It also happens when using the following function:
smp_call_function_single
which can be found in smp.c # the Linux kernel source code.
instead of my function:
run_func
I am trying to create user level thread. Here is a sample of my code. Can any body help me what is the problem in this program.
#include<stdio.h>
#include<ucontext.h>
int thread_counter = 0;
int thread1, thread2;
int who_am_i;
struct TCB {
ucontext_t context;
void (* fun_ptr)();
};
struct TCB tcb[3];
char stack[2][8192];
//----------------------
int thread_create(void (*fun)()) {
static volatile int s;
thread_counter++;
s = 0;
getcontext(&tcb[thread_counter].context);
if(!s) {
tcb[thread_counter].context.uc_stack.ss_sp = stack[thread_counter];
tcb[thread_counter].context.uc_stack.ss_size = sizeof(stack[thread_counter]);
tcb[thread_counter].context.uc_link = &tcb[0].context;
tcb[thread_counter].fun_ptr = fun;
s = 1;
}
else {
tcb[who_am_i].fun_ptr();
}
return thread_counter;
}
void thread_yield(int next_thread) {
static volatile int switched;
switched = 0;
getcontext(&tcb[who_am_i].context);
if(!switched) {
switched = 1;
who_am_i = next_thread;
setcontext(&tcb[next_thread].context);
}
}
//----------------------
void f1() {
printf("start f1\n");
thread_yield(thread2);
printf("finish f1:\n");
}
void f2() {
printf("start f2\n");
thread_yield(thread1);
printf("finish f2\n");
}
//----------------------
int main() {
thread1 = thread_create(f1);
thread2 = thread_create(f2);
who_am_i = 0;
thread_yield(thread1);
return 0;
}
Thread is not switching properly. When I run it, it gives following output:
start f1
start f2
finish f2
Thank you
You have an undefined behavior situation.
In thread_create you increase thread_counter the first thing you do. So when you create the second thread thread_counter will be 2. Then you access stack[2] which will give you undefined behavior.
You also hardcode the uc_link member of the context to &tcb[0].context, which is never initialized due to your "premature" increment of thread_counter.
But the main problem is that you don't actually create a new context, you just get the context for the current thread. You should use makecontext for each thread instead.
I have found there is a resource contention when using localtime function in multi-threaded program.
But there is only one lock resource in the function? What might be the root cause for this bug?
Any suggestions or ideas on this issue is well appreciated.
code is as bellow, cut from glibc,
struct tm *
localtime (t)
const time_t *t;
{
return __tz_convert (t, 1, &_tmbuf); //tmbuf temp variable
}
struct tm *
__tz_convert (const time_t *timer, int use_localtime, struct tm *tp)
{
long int leap_correction;
int leap_extra_secs;
if (timer == NULL)
{
__set_errno (EINVAL);
return NULL;
}
__libc_lock_lock (tzset_lock);
/* Update internal database according to current TZ setting.
POSIX.1 8.3.7.2 says that localtime_r is not required to set tzname.
This is a good idea since this allows at least a bit more parallelism. */
tzset_internal (tp == &_tmbuf && use_localtime, 1);
if (__use_tzfile)
__tzfile_compute (*timer, use_localtime, &leap_correction,
&leap_extra_secs, tp);
else
{
if (! __offtime (timer, 0, tp))
tp = NULL;
else
__tz_compute (*timer, tp, use_localtime);
leap_correction = 0L;
leap_extra_secs = 0;
}
if (tp)
{
if (! use_localtime)
{
tp->tm_isdst = 0;
tp->tm_zone = "GMT";
tp->tm_gmtoff = 0L;
}
if (__offtime (timer, tp->tm_gmtoff - leap_correction, tp))
tp->tm_sec += leap_extra_secs;
else
tp = NULL;
}
__libc_lock_unlock (tzset_lock);
return tp;
}