bpf how to inspect syscall arguments

bpf how to inspect syscall arguments - c

trace_output_kern.c traces sys_write syscall and prints the pid in userland:
#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
/*
 * Perf-event array map. The kprobe program in this listing passes it to
 * bpf_perf_event_output() to emit its records.
 */
struct bpf_map_def SEC("maps") my_map = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(u32),
.max_entries = 2,
};
/*
 * kprobe attached to sys_write: stores the value returned by
 * bpf_get_current_pid_tgid() and a fixed cookie in a small record and emits
 * that record through my_map. Always returns 0.
 */
SEC("kprobe/sys_write")
int bpf_prog1(struct pt_regs *ctx)
{
/* Record layout emitted to the perf buffer; both fields are assigned below. */
struct S {
u64 pid;
u64 cookie;
} data;
data.pid = bpf_get_current_pid_tgid();
data.cookie = 0x12345678;
/* Both members are written at this point, so all sizeof(data) bytes are initialised. */
bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));
return 0;
}
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
sys_write has a signature of sys_write(unsigned int fd, const char __user *buf, size_t count);, and currently we only see the PID. The premise of tracing is that we get to intercept and inspect the arguments, so I was trying to see the arguments that get passed in as well.
If I change that struct S to hold a char array to hold char *buf as
struct S {
u64 pid;
u64 cookie;
char bleh[128]; //<-- added this
} data;
it is throwing a fit:
/usr/src/linux-5.4/samples/bpf# ./trace_output
bpf_load_program() err=13
0: (bf) r6 = r1
1: (85) call bpf_get_current_pid_tgid#14
2: (b7) r1 = 305419896
3: (7b) *(u64 *)(r10 -136) = r1
4: (7b) *(u64 *)(r10 -144) = r0
5: (bf) r4 = r10
6: (07) r4 += -144
7: (bf) r1 = r6
8: (18) r2 = 0xffff8975bd44aa00
10: (b7) r3 = 0
11: (b7) r5 = 144
12: (85) call bpf_perf_event_output#25
invalid indirect read from stack off -144+16 size 144
processed 12 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
0: (bf) r6 = r1
1: (85) call bpf_get_current_pid_tgid#14
2: (b7) r1 = 305419896
3: (7b) *(u64 *)(r10 -136) = r1
4: (7b) *(u64 *)(r10 -144) = r0
5: (bf) r4 = r10
6: (07) r4 += -144
7: (bf) r1 = r6
8: (18) r2 = 0xffff8975bd44aa00
10: (b7) r3 = 0
11: (b7) r5 = 144
12: (85) call bpf_perf_event_output#25
invalid indirect read from stack off -144+16 size 144
processed 12 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
if sys_write is a bad (question) example, I've been also trying to trace sys_execve, which has an arg list of
asmlinkage long sys_execve(const char __user *filename,
const char __user *const __user *argv,
const char __user *const __user *envp);
Please point me the correct direction, thanks!
Edit 1
How do I intercept the arguments that was used for __x64_sys_execve?
When I try this below,
#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
/*
 * Perf-event array map. The kprobe program in this listing passes it to
 * bpf_perf_event_output() to emit its records.
 */
struct bpf_map_def SEC("maps") my_map = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(u32),
.max_entries = 2,
};
//SEC("kprobe/sys_write")
SEC("kprobe/__x64_sys_execve")
/* Signature of sys_execve:
asmlinkage long sys_execve(const char __user *filename,
const char __user *const __user *argv,
const char __user *const __user *envp);
*/
/*
 * Attempt to capture execve's filename: requests a 128-byte copy from the
 * filename pointer into data.bleh and emits the record through my_map.
 *
 * NOTE(review): the loader log that follows this listing rejects the program
 * at its first instruction ("r6 = r2", "R2 !read_ok"), i.e. on the read of the
 * second parameter. The accompanying answer says the entry point may only take
 * struct pt_regs *ctx and should fetch arguments with PT_REGS_PARM*(ctx).
 */
int bpf_prog1(struct pt_regs *ctx, const char *filename)
{
struct S {
u64 pid;
u64 cookie;
char bleh[128];
} data;
data.pid = bpf_get_current_pid_tgid();
data.cookie = 0x12345678;
//bpf_get_current_comm(&data.bleh, 128);
/* Requests a 128-byte copy from filename into data.bleh. */
bpf_probe_read(&data.bleh, 128, (void *)filename);
bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));
return 0;
}
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
It blows up thusly:
/usr/src/linux-5.4/samples/bpf# ./borky
bpf_load_program() err=13
0: (bf) r6 = r2
R2 !read_ok
processed 1 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
0: (bf) r6 = r2
R2 !read_ok
processed 1 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0

The first part of your question was answered by pchaigno: if you extend your struct S and try to read it (bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));) without having initialised it, the verifier complains, because reading uninitialised memory from the kernel introduces a security risk. What you could do is, for example, zero the whole struct when declaring it:
struct S {
u64 pid;
u64 cookie;
char bleh[128];
} data = {0};
Regarding the second part of your question with sys_execve, it turns out you cannot pass the syscall arguments to your function bpf_prog1() as you try to do. Your function should only take the struct pt_regs *ctx.
The confusion likely comes from the syntax used in bcc, where arguments are passed this way, but it is important to understand that bcc rewrites some parts under the hood, in particular this thing about accessing the arguments.
What you could use instead is the set of PT_REGS_PARM*(ctx) macros that are specifically defined to access the arguments of the probed function, from the relevant computer registers (example, definition). I think bcc also uses them when doing its rewriting job, but you wouldn't see it.

Related

spin locks in ebpf. what this error instruction dump means and how to use spin lock

I like to test this function bpf_spin_lock in ebpf kernel program.
This is my code
#include <stddef.h>
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
/* Defines xdp_stats_map from packet04 */
#include "../common/xdp_stats_kern_user.h"
#include "../common/xdp_stats_kern.h"
#include <../common/parsing_helpers.h>
#include <bpf/bpf_helpers.h>
#define ETH_ALEN 6
#define MAX_ENTRIES 1000
/*struct {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
__type(key, __u32);
__type(value, __u64);
__uint(max_entries, MAX_ENTRIES);
} hash_map SEC(".maps");
*/
/* Value type stored in hash_map: a counter together with a bpf_spin_lock. */
struct hash_elem {
int cnt;
struct bpf_spin_lock lock;
};
/* Hash map keyed by __u32 that holds up to 100 struct hash_elem values. */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, __u32);
__type(value, struct hash_elem);
__uint(max_entries, 100);
} hash_map SEC(".maps");
/* Wrapper around a single bpf_spin_lock; not referenced by the code shown here. */
struct a{struct bpf_spin_lock lock;};
/* Two-entry hash map from __u32 to long; not referenced by the program shown here. */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, __u32);
__type(value, long);
__uint(max_entries, 2);
} hash_map1 SEC(".maps");
//static __u32 i=0;
/*
 * XDP entry point: increments the counter of a local struct hash_elem between
 * bpf_spin_lock()/bpf_spin_unlock(), stores the element in hash_map keyed by
 * the counter value, and returns XDP_PASS.
 *
 * NOTE(review): val lives on the program stack and is never initialised, so
 * val.cnt++ reads an indeterminate value. The load log that follows this
 * listing rejects the bpf_spin_lock() call with
 * "R1 type=fp expected=map_value": the lock argument is a stack pointer where
 * a pointer into a map value is expected. Presumably the element should be
 * obtained with bpf_map_lookup_elem() and locked in place -- confirm.
 */
SEC("xdp")
int xdp_prog_simple(struct xdp_md *ctx)
{
struct hash_elem val;
bpf_spin_lock(&val.lock);
val.cnt++;
bpf_spin_unlock(&val.lock);
bpf_map_update_elem(&hash_map, &val.cnt, &val, BPF_ANY);
return XDP_PASS;
}
But to load program there is error
libbpf: load bpf program failed: Permission denied
libbpf: -- BEGIN DUMP LOG ---
libbpf:
; bpf_spin_lock(&val.lock);
0: (bf) r6 = r10
1: (07) r6 += -4
; bpf_spin_lock(&val.lock);
2: (bf) r1 = r6
3: (85) call bpf_spin_lock#93
R1 type=fp expected=map_value
processed 4 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
libbpf: -- END LOG --
libbpf: failed to load program 'xdp'
libbpf: failed to load object './k.o'
ERR: loading BPF-OBJ file(./k.o) (-22): Invalid argument
ERR: loading file: ./k.o
The last line of function instructions are
; bpf_spin_lock(&val.lock);
2: (bf) r1 = r6
3: (85) call bpf_spin_lock#93
R1 type=fp expected=map_value
Is it saying that bpf_spin_lock expects a map value? That makes no sense to me, so please correct me if it means something else: what do the error and the instruction dump actually mean?
Can I use spin lock from the api in ebpf/xdp programs?

Cannot access __sk_buff in eBPF

I am trying to access the __sk_buff struct passed within the consume_skb tracepoint. However nothing I tried has worked so far. I am running the kernel version 5.4.0. This is what I tried so far, and in what it resulted:
Loader
gist
Direct access
/*
 * Tracepoint handler for skb/consume_skb that treats its context as a
 * struct __sk_buff, bounds-checks an Ethernet + IP header against data_end
 * and tries to print the Ethernet protocol field.
 *
 * NOTE(review): the load log that follows shows skb->data and skb->data_end
 * being tracked as plain scalars ("inv"), and the h_proto load is rejected
 * with "R2 invalid mem access 'inv'". The accompanying answer states that the
 * tracepoint argument is a struct sk_buff, not a struct __sk_buff, and
 * suggests reading it with bpf_probe_read -- confirm.
 */
SEC("tracepoint/skb/consume_skb")
int handle_skb(struct __sk_buff *skb)
{
void *data = (void *)(long)skb->data;
void *data_end = (void *)(long)skb->data_end;
struct ethhdr *eth = data;
struct iphdr *iph = data + sizeof(*eth);
/* Bail out unless a full Ethernet header fits before data_end. */
if (data + sizeof(*eth) > data_end)
return 0;
/* Bail out unless the IP header fits as well; iph itself is not read afterwards. */
if (data + sizeof(*eth) + sizeof(*iph) > data_end)
return 0;
bpf_trace_printk("%x\n", eth->h_proto);
return 0;
}
results in:
libbpf: prog 'handle_skb': BPF program load failed: Permission denied
libbpf: prog 'handle_skb': -- BEGIN PROG LOAD LOG --
; void *data = (void *)(long)skb->data;
0: (61) r2 = *(u32 *)(r1 +76)
; void *data_end = (void *)(long)skb->data_end;
1: (61) r1 = *(u32 *)(r1 +80)
; struct iphdr *iph = data + sizeof(*eth);
2: (bf) r3 = r2
3: (07) r3 += 14
; if (data + sizeof(*eth) > data_end)
4: (2d) if r3 > r1 goto pc+7
R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3_w=inv(id=0,umin_value=14,umax_value=4294967309,var_off=(0x0; 0x1ffffffff)) R10=fp0
;
5: (bf) r3 = r2
6: (07) r3 += 34
; if (data + sizeof(*eth) > data_end)
7: (2d) if r3 > r1 goto pc+4
R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3_w=inv(id=0,umin_value=34,umax_value=4294967329,var_off=(0x0; 0x1ffffffff)) R10=fp0
; bpf_trace_printk("%x\n", eth->h_proto);
8: (69) r2 = *(u16 *)(r2 +12)
R2 invalid mem access 'inv'
processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
-- END PROG LOAD LOG --
libbpf: failed to load program 'handle_skb'
libbpf: failed to load object 'skb_bpf'
libbpf: failed to load BPF skeleton 'skb_bpf': -13
Failed to load and verify BPF skeleton
The way I understand it, is that direct access to skb is not allowed in this context (why?).
bpf_skb_load_bytes
/*
 * Variant of the consume_skb handler that tries to copy the Ethernet header
 * out of the skb with a load-bytes helper before printing h_proto.
 *
 * NOTE(review): eth is declared as a pointer and passed as the destination
 * without ever being assigned, so the helper is given an indeterminate
 * address. The load log that follows reports
 * "unknown func bpf_skb_load_bytes#26"; the accompanying answer explains that
 * tracing programs cannot use that helper.
 */
SEC("tracepoint/skb/consume_skb")
int handle_skb(struct __sk_buff *skb)
{
struct ethhdr *eth;
int ret;
ret = skb_load_bytes(skb, 0, eth, sizeof(struct ethhdr));
//Or alternatively
//ret = bpf_skb_load_bytes(skb, 0, eth, sizeof(struct ethhdr));
/* NOTE(review): returns early when ret == 0; if 0 means success for this helper, the test is inverted -- confirm. */
if (!ret)
{
return 0;
}
bpf_trace_printk("%x\n", eth->h_proto);
return 0;
}
which results in:
libbpf: prog 'handle_skb': BPF program load failed: Invalid argument
libbpf: prog 'handle_skb': -- BEGIN PROG LOAD LOG --
; ret = bpf_skb_load_bytes(skb, 0, eth, sizeof(struct ethhdr));
0: (b7) r2 = 0
1: (b7) r4 = 14
2: (85) call bpf_skb_load_bytes#26
unknown func bpf_skb_load_bytes#26
processed 3 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
-- END PROG LOAD LOG --
libbpf: failed to load program 'handle_skb'
libbpf: failed to load object 'skb_bpf'
libbpf: failed to load BPF skeleton 'skb_bpf': -22
Failed to load and verify BPF skeleton
This confuses me even more, as bpf_skb_load_bytes should be an available helper function.
Now I am doubting how to even access and read (not even write) the __sk_buff.
Is there just no way? Or have I missed something?

There are a few things going on here. First, the argument to the tracepoint is of type struct sk_buff, not struct __sk_buff. I believe you have to use the bpf_probe_read helper to read the memory the pointer is pointing to.
Second is that bpf_skb_load_bytes would only work with a __sk_buff. Besides, not all helpers are available for each program type, tracing programs can't use the bpf_skb_load_bytes helper. Here is the code that lists all available helpers for tracepoint programs:
https://elixir.bootlin.com/linux/v5.18.1/source/kernel/trace/bpf_trace.c#L1415
https://elixir.bootlin.com/linux/v5.18.1/source/kernel/trace/bpf_trace.c#L1167

debugging with timer/signal always ends up in <timer_settime+16>

I am using gdb to debug a code that starts a timer. When the timer rings in gdb I always end up at instruction timer_settime+16.
Is this expected behavior?
As an example I slightly modified the code of timer_settime man page. The idea is to pass two arguments: a string of integers and a nsec value. The code launches the timer to ring after nsec, then copies the string.
I expected that by incrementing the nsec value, gdb would stop at different code lines and eventually end up inside the copy loop. However, it always stops at <timer_settime+16>.
So is this expected behavior?
Is it documented somewhere?
Is there a way to achieve what I expected (i.e.: launch a timer that when ring makes gdb stops where the program was just before (or after) the signal)? (always with nsec granularity).
Code:
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <signal.h>
#include <time.h>
#define CLOCKID CLOCK_REALTIME
#define SIG SIGUSR1
/* Report msg through perror() and terminate the process with EXIT_FAILURE. */
#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \
} while (0)
unsigned char OUT[32];
unsigned char IN[32];
/*
 * Convert one ASCII hexadecimal digit to its numeric value.
 *
 * in: a character in '0'-'9', 'A'-'F' or 'a'-'f'.
 *
 * Returns the digit's value (0..15). Any other character yields 0; the
 * previous version returned an uninitialized local in that case, which is
 * undefined behavior.
 */
unsigned char ascii2hex(char in){
    if (in >= '0' && in <= '9')
        return (unsigned char)(in - '0');
    if (in >= 'A' && in <= 'F')
        return (unsigned char)(in - 'A' + 10);
    if (in >= 'a' && in <= 'f')
        return (unsigned char)(in - 'a' + 10);
    return 0; /* not a hex digit */
}
/*
 * Decode a string of ASCII hex digits into bytes.
 *
 * in:  source characters; consumed two at a time (high nibble first).
 * out: destination buffer; receives one byte per pair of characters.
 * len: number of characters of `in` to consume, stepped in pairs.
 */
void asciiStr2hex(char * in, unsigned char * out, unsigned int len){
    unsigned char *dst = out;
    int pos;

    for (pos = 0; pos < len; pos += 2) {
        unsigned char high = ascii2hex(in[pos]);
        unsigned char low = ascii2hex(in[pos + 1]);

        *dst++ = (high << 4) + low;
    }
}
/*
 * Byte-by-byte copy of `len` bytes from `in` to `out`.
 * Kept as an explicit loop: it is the code the timer is meant to interrupt.
 */
void testcode(unsigned char *out, unsigned char *in, unsigned int len){
    unsigned char *dst = out;
    unsigned char *src = in;
    unsigned int remaining;

    for (remaining = len; remaining != 0; remaining--)
        *dst++ = *src++;
}
/*
 * Print the timer-related contents of a siginfo_t: the sival_ptr value, the
 * timer ID it points to, and that timer's overrun count. Exits via errExit()
 * if timer_getoverrun() fails.
 */
static void print_siginfo(siginfo_t *si)
{
    timer_t *timer = si->si_value.sival_ptr;
    int overruns;

    printf(" sival_ptr = %p; ", si->si_value.sival_ptr);
    printf(" *sival_ptr = 0x%lx\n", (long) *timer);

    overruns = timer_getoverrun(*timer);
    if (overruns != -1) {
        printf(" overrun count = %d\n", overruns);
        return;
    }
    errExit("timer_getoverrun");
}
/*
 * Handler installed for SIG in main() (sa_sigaction with SA_SIGINFO): reports
 * the signal number and the timer details, then sets the signal's disposition
 * to SIG_IGN. The uc argument is unused.
 */
static void handler(int sig, siginfo_t *si, void *uc)
{
/* Note: calling printf() from a signal handler is not
strictly correct, since printf() is not async-signal-safe;
see signal(7) */
printf("Caught signal %d\n", sig);
print_siginfo(si);
/* From here on, further deliveries of this signal are ignored. */
signal(sig, SIG_IGN);
}
/*
 * Usage: prog <hex string> <time-nanosecs>
 *
 * Decodes argv[1] into IN, installs handler() for SIG, creates a CLOCKID
 * timer that raises SIG, arms it with argv[2] nanoseconds (both initial
 * expiry and interval), then runs testcode() and exits.
 */
int main(int argc, char *argv[])
{
timer_t timerid;
struct sigevent sev;
struct itimerspec its;
long long freq_nanosecs;
//sigset_t mask;
struct sigaction sa;
if (argc != 3) {
fprintf(stderr, "Usage: %s <16byte> <time-nanosecs>\n",
argv[0]);
exit(EXIT_FAILURE);
}
/* Consumes 32 characters of argv[1]. NOTE(review): the length of argv[1] is not checked first. */
asciiStr2hex(argv[1], IN, 32);
/* Establish handler for timer signal */
printf("Establishing handler for signal %d\n", SIG);
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = handler;
sigemptyset(&sa.sa_mask);
if (sigaction(SIG, &sa, NULL) == -1)
errExit("sigaction");
/* Block timer signal temporarily */
/* printf("Blocking signal %d\n", SIG);
sigemptyset(&mask);
sigaddset(&mask, SIG);
if (sigprocmask(SIG_SETMASK, &mask, NULL) == -1)
errExit("sigprocmask");
*/
/* Create the timer */
sev.sigev_notify = SIGEV_SIGNAL;
sev.sigev_signo = SIG;
/* The handler receives this pointer back through si_value.sival_ptr. */
sev.sigev_value.sival_ptr = &timerid;
if (timer_create(CLOCKID, &sev, &timerid) == -1)
errExit("timer_create");
printf("timer ID is 0x%lx\n", (long) timerid);
/* Start the timer */
freq_nanosecs = atoll(argv[2]);
its.it_value.tv_sec = freq_nanosecs / 1000000000;
its.it_value.tv_nsec = freq_nanosecs % 1000000000;
/* The interval is set to the same value as the initial expiry. */
its.it_interval.tv_sec = its.it_value.tv_sec;
its.it_interval.tv_nsec = its.it_value.tv_nsec;
if (timer_settime(timerid, 0, &its, NULL) == -1)
errExit("timer_settime");
/* Sleep for a while; meanwhile, the timer may expire
multiple times */
/* NOTE(review): no sleep follows; the message prints atoi(argv[1]) and execution goes straight to testcode(). */
printf("Sleeping for %d seconds\n", atoi(argv[1]));
testcode(OUT, IN, 16);
/* Unlock the timer signal, so that timer notification
can be delivered */
/* printf("Unblocking signal %d\n", SIG);
if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1)
errExit("sigprocmask");
*/
exit(EXIT_SUCCESS);
}
When debug into gdb with r 00112233445566778899001122334455 2
I obtain:
Program received signal SIGUSR1, User defined signal 1.
0x76fc7c38 in timer_settime () from /lib/arm-linux-gnueabihf/librt.so.1
(gdb) x/30i $pc
=> 0x76fc7c38 <timer_settime+16>: cmn r0, #4096 ; 0x1000
0x76fc7c3c <timer_settime+20>: mov r4, r0
0x76fc7c40 <timer_settime+24>: bhi 0x76fc7c4c <timer_settime+36>
0x76fc7c44 <timer_settime+28>: mov r0, r4
0x76fc7c48 <timer_settime+32>: pop {r3, r4, r7, pc}
0x76fc7c4c <timer_settime+36>: bl 0x76fc55b4
0x76fc7c50 <timer_settime+40>: rsb r3, r4, #0
0x76fc7c54 <timer_settime+44>: mvn r4, #0
0x76fc7c58 <timer_settime+48>: str r3, [r0]
0x76fc7c5c <timer_settime+52>: b 0x76fc7c44 <timer_settime+28>
0x76fc7c60 <timer_settime+56>: andeq r0, r0, r2, lsl #2
0x76fc7c64: push {r4, r5, r6, r7, r8, r9, r10, lr}
0x76fc7c68: sub sp, sp, #600 ; 0x258
0x76fc7c6c: ldr r4, [pc, #340] ; 0x76fc7dc8
0x76fc7c70: add r1, sp, #512 ; 0x200
0x76fc7c74: add r4, pc, r4
0x76fc7c78: mov r0, r4
0x76fc7c7c: bl 0x76fc56b0
0x76fc7c80: cmp r0, #0
0x76fc7c84: bne 0x76fc7c98
0x76fc7c88: ldr r2, [sp, #512] ; 0x200
0x76fc7c8c: ldr r3, [pc, #312] ; 0x76fc7dcc
0x76fc7c90: cmp r2, r3
0x76fc7c94: beq 0x76fc7d94
0x76fc7c98: ldr r5, [pc, #304] ; 0x76fc7dd0
0x76fc7c9c: ldr r0, [pc, #304] ; 0x76fc7dd4
0x76fc7ca0: add r5, pc, r5
0x76fc7ca4: add r0, pc, r0
0x76fc7ca8: mov r1, r5
0x76fc7cac: bl 0x76fc5524
I am running this code on a Raspberry Pi, but I am pretty sure I saw the same behavior on another x86_64 Linux machine.
I have tested with "handle stop SIGUSR1".

I finally found that the problem was that I have to set unwindonsignal off in gdb to obtain the behavior I expected.

How to measure program execution time in ARM Cortex-A53 processor?

I was using following method to read clock in cortex-a15:
/*
 * Sample a PMU counter through CP15 together with the wall-clock time.
 *
 * result: array of at least three unsigned ints; receives the counter value,
 *         tv_usec and tv_sec, in that order.
 *
 * On the first call (enabled == 0) the PMU is programmed before the read.
 * NOTE(review): `enabled` is not declared in this snippet; presumably a
 * file-scope flag that starts at 0 -- confirm.
 */
static void readticks(unsigned int *result)
{
struct timeval t;
unsigned int cc;
if (!enabled) {
// program the performance-counter control-register:
asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(17));
//enable all counters
asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(0x8000000f));
//Clear overflow.
asm volatile("mcr p15, 0, %0, c9, c12, 3" :: "r"(0x8000000f));
enabled = 1;
}
/* Read the counter at c9, c13, 0 (presumably the cycle counter, PMCCNTR -- confirm). */
asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc));
gettimeofday(&t,(struct timezone *) 0);
result[0] = cc;
result[1] = t.tv_usec;
result[2] = t.tv_sec;
}
And the final performance profiling looks like:
before = readticks();
foo();
after = readticks();
clock_cycles = after - before.
I want to use same logic in cortex-A53, ARM64 (not aarch32).
I have tried this after following online portals:
/* PMCR bit 0 (E): all counters, including PMCCNTR_EL0, are disabled/enabled */
#define QUADD_ARMV8_PMCR_E (1 << 0)
/* PMCR bit 1 (P): reset all event counters, not including PMCCNTR_EL0, to 0 */
#define QUADD_ARMV8_PMCR_P (1 << 1)
/* PMCR bit 2 (C): reset PMCCNTR_EL0 to 0 */
#define QUADD_ARMV8_PMCR_C (1 << 2)
/* PMCR bit 3 (D): clock divider, PMCCNTR_EL0 counts every clock cycle/every 64 clock cycles */
#define QUADD_ARMV8_PMCR_D (1 << 3)
/* PMCR bit 4 (X): export of events is disabled/enabled */
#define QUADD_ARMV8_PMCR_X (1 << 4)
/* PMCR bit 5 (DP): disable cycle counter, PMCCNTR_EL0, when event counting is prohibited */
#define QUADD_ARMV8_PMCR_DP (1 << 5)
/* PMCR bit 6 (LC): long cycle count enable */
#define QUADD_ARMV8_PMCR_LC (1 << 6)
/* Return the current value of PMCR_EL0, read with an mrs instruction. */
static inline unsigned int armv8_pmu_pmcr_read(void)
{
unsigned int val;
/* Read Performance Monitors Control Register */
asm volatile("mrs %0, pmcr_el0" : "=r" (val));
return val;
}
/*
 * Write val, masked with QUADD_ARMV8_PMCR_WR_MASK, to PMCR_EL0.
 * NOTE(review): QUADD_ARMV8_PMCR_WR_MASK is not defined in the snippet shown;
 * confirm it is provided elsewhere.
 */
static inline void armv8_pmu_pmcr_write(unsigned int val)
{
asm volatile("msr pmcr_el0, %0" : :"r" (val & QUADD_ARMV8_PMCR_WR_MASK));
}
static void enable_all_counters(void)
{
unsigned int val;
/* Enable all counters */
val = armv8_pmu_pmcr_read();
val |= QUADD_ARMV8_PMCR_E | QUADD_ARMV8_PMCR_X;
armv8_pmu_pmcr_write(val);
}
/*
 * Reset the event counters (P) and the cycle counter (C) by setting both
 * bits on top of the current PMCR_EL0 value.
 */
static void reset_all_counters(void)
{
    armv8_pmu_pmcr_write(armv8_pmu_pmcr_read()
                         | QUADD_ARMV8_PMCR_P
                         | QUADD_ARMV8_PMCR_C);
}
static void readticks(unsigned int *result)
{
struct timeval t;
unsigned int cc;
unsigned int val;
if (!enabled) {
reset_all_counters();
enable_all_counters();
enabled = 1;
}
cc = armv8_pmu_pmcr_read();
gettimeofday(&t,(struct timezone *) 0);
result[0] = cc;
result[1] = t.tv_usec;
result[2] = t.tv_sec;
}
But it gives "Illegal instruction" as error while I am trying profiling. Can anyone help me to change the above code for cortex-a53?

You need to enable the PMU for user mode. Here is the kernel module I wrote for it(For ARM V7 in Raspberry Pi 2):
/* Module source file 'module.c'. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
/*
 * Write val to the ARMv7 user-enable register for the PMU (PMUSERENR,
 * CP15 c9, c14, 0). Bit 0 set grants user mode access to the PMU registers.
 *
 * Fixes over the previous version: an explicit return type (implicit int is
 * not valid C99), the parameter is actually used instead of a hard-coded 1,
 * and the stray "mrc" that wrote into an input-only operand is gone.
 *
 * NOTE(review): the register is per CPU and this runs only on the CPU that
 * executes module init -- confirm whether every core needs the write.
 */
static void arm_write(unsigned long val)
{
    asm volatile("mcr p15, 0, %0, c9, c14, 0" :: "r"(val));
}
/* Module init: log a message and enable user-mode PMU access; always returns 0. */
static int enabler(void)
{
    printk(KERN_INFO "Enabling PMU usermode.\n");
    arm_write(1UL);
    return 0;
}
/* Module exit: only logs a message. The PMU setting made at init is not reverted here. */
static void end(void)
{
printk(KERN_INFO "module unloaded.\n");
}
module_init(enabler);
module_exit(end);
MODULE_AUTHOR("Sama");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Blahblah");
This will enable user-mode access to the PMU. Once you have compiled it, you need to enable the PMU counters as follows:
/*
 * Program the ARMv7 PMU from user space (requires user-mode PMU access to be
 * enabled already) and print the current value of event counter 0.
 *
 * Fix: the event count was printed with "%ul", which is "%u" followed by a
 * literal 'l'; it is now printed with "%u".
 */
int main(int argc, char **argv){
    int enable_divider = 1;
    int do_reset = 1;
    int value = 1;
    unsigned int output;

    // perform reset:
    if (do_reset) {
        value |= 2; // reset all counters to zero.
        value |= 4; // reset cycle counter to zero.
    }
    // enable "by 64" divider for CCNT: the counter then increments by 1 for
    // every 64 cpu cycles instead of on every cycle.
    if (enable_divider)
        value |= 8;
    value |= 16; // set unconditionally

    // program the performance-counter control-register with mask constructed above
    asm volatile ("MCR p15, 0, %0, c9, c12, 0\t\n" :: "r"(value));
    // enable all counters:
    asm volatile ("MCR p15, 0, %0, c9, c12, 1\t\n" :: "r"(0x8000000f));
    // clear overflows:
    asm volatile ("MCR p15, 0, %0, c9, c12, 3\t\n" :: "r"(0x80000001));
    // Select individual counter (0)
    asm volatile ("MCR p15, 0, %0, c9 , c12 , 5\t\n":: "r"(0x00));
    // Write event.
    // NOTE(review): the original comment said "0x11 = Cycle count" but the
    // value written is 0xD -- confirm which event is intended.
    asm volatile ("MCR p15, 0, %0, c9 , c13 , 1\t\n":: "r"(0xD));

    printf("Hi");

    // Read current event counter
    asm volatile ("MRC p15, 0, %0, c9 , c13 , 2\t\n": "=r"(output));
    printf("Event count 0: %u\n", output);
    printf("Normal Execution, No Buffer Overflow Occurred.\n");
    return 0;
}
Unfortunately, however, what you get is not only your program's CPU cycles but the entire system's CPU cycles! So what I recommend is to use perf.
Write your asm code in an inline assembly code in C and then put it like this:
/*
 * Template for counting user-space CPU cycles around a code region with a
 * perf event: opens a PERF_COUNT_HW_CPU_CYCLES counter (created disabled,
 * kernel and hypervisor excluded), resets and enables it, runs the region,
 * disables it, then reads and prints the count.
 *
 * The parameters z and b are unused and the function always returns 5.
 * NOTE(review): perf_event_open() is called directly; presumably a
 * syscall() wrapper with that name is defined elsewhere -- confirm.
 */
int dummya(int z, int b){
//This is my function you need to change it for yourself
struct perf_event_attr pe;
long long count;
int fd;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_CPU_CYCLES;
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
fd = perf_event_open(&pe, 0, -1, -1, 0);
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", pe.config);
exit(EXIT_FAILURE);
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
//From here the counter starts.
/* Placeholders: replace these with the instructions to be measured. */
asm("Your ASM Codes");
asm("Your ASM Codes");
asm("Your ASM Codes");
asm("Your ASM Codes");
asm("Your ASM Codes");
asm("Your ASM Codes");
asm("Your ASM Codes");
asm("Your ASM Codes");
//Disabling Counter
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
/* The return value of read() is not checked. */
read(fd, &count, sizeof(long long));
printf("%lld\n", count);
close(fd);
return 5;
}
Also be advised that you need a recent kernel to access the perf driver.

Linux module: performance counter does not work

I want to monitor the cache request number in the last level cache. I wrote a Linux module to get that information based on the tutorial here.
It can compile and run, but the output result is always 0. In other words, when I use rdmsr, it always give me edx=0, eax=0. I even tried the demo code in the tutorial, the output is still 0.
I'm stuck at this problem for a whole week. Could anyone help me point out the mistake I made in the program?
I knew there are some existing programs doing the same thing, but I have to know how to write the code by myself, because I want to monitor the cache request in Xen hypervisor. I cannot use those tools in Xen unless I incorporate the tools into Xen's hypervisor, which seems more work.
/*
* Record the cache miss rate of Intel Sandybridge cpu
* To confirm the event is correctly set!
*/
#include <linux/module.h> /* Needed by all modules */
#include <linux/kernel.h> /* Needed for KERN_INFO */
/* The four performance event select MSRs (address goes in %ecx for wrmsr) */
#define PERFEVTSEL0 0x186
#define PERFEVTSEL1 0x187
#define PERFEVTSEL2 0x188
#define PERFEVTSEL3 0x189
/*
 * The four performance counter MSRs; PMCn is the counter programmed by
 * PERFEVTSELn. PMC2 and PMC3 were previously 0xc2 and 0xc3, which made PMC2
 * alias PMC1 and shifted PMC3 down by one.
 */
#define PMC0 0xc1
#define PMC1 0xc2
#define PMC2 0xc3
#define PMC3 0xc4
/*Intel Software Developer Manual Page 2549*/ /*L1I L1D cache events has not been confirmed!*/
/*L1 Instruction Cache Performance Tuning Events*/
#define L1I_ALLHIT_EVENT 0x80
#define L1I_ALLHIT_MASK 0x01
#define L1I_ALLMISS_EVENT 0x80 /*confirmed*/
#define L1I_ALLMISS_MASK 0x02 /*confirmed*/
/*L1 Data Cache Performance Tuning Events*/
/*Intel does not have the ALLREQ Miss mask; have to add LD_miss and ST_miss*/
#define L1D_ALLREQ_EVENT 0x43
#define L1D_ALLREQ_MASK 0x01
#define L1D_LDMISS_EVENT 0x40
#define L1D_LDMISS_MASK 0x01
#define L1D_STMISS_EVENT 0x28
#define L1D_STMISS_MASK 0x01
/*L2 private cache for each core*/ /*confirmed*/
#define L2_ALLREQ_EVENT 0x24
#define L2_ALLREQ_MASK L2_ALLCODEREQ_MASK /*0xFF*/
#define L2_ALLMISS_EVENT 0x24
#define L2_ALLMISS_MASK L2_ALLCODEMISS_MASK /*0xAA*/
#define L2_ALLCODEREQ_MASK 0x30
#define L2_ALLCODEMISS_MASK 0x20
/*L3 shared cache*/ /*confirmed*/
/*Use the last level cache event and mask*/
#define L3_ALLREQ_EVENT 0x2E
#define L3_ALLREQ_MASK 0x4F
#define L3_ALLMISS_EVENT 0x2E
#define L3_ALLMISS_MASK 0x41
/* Event-select bit 16: count while in user mode. */
#define USR_BIT (0x01UL << 16)
/* Event-select bit 17: count while in kernel mode. */
#define OS_BIT (0x01UL << 17)
/*
 * Helpers that set/clear the bits above in an event-select value.
 * Arguments and expansions are parenthesized so that expressions can be
 * passed safely. CLEAR_MSR_USR_BIT previously named its parameter "exa" and
 * therefore only worked on a variable literally called eax.
 */
#define SET_MSR_USR_BIT(eax) ((eax) |= USR_BIT)
#define CLEAR_MSR_USR_BIT(eax) ((eax) &= (~USR_BIT))
#define SET_MSR_OS_BIT(eax) ((eax) |= OS_BIT)
#define CLEAR_MSR_OS_BIT(eax) ((eax) &= (~OS_BIT))
/* OR the event code (bits 0-7) and unit mask (bits 8-15) into eax. */
#define SET_EVENT_MASK(eax, event, umask) ((eax) |= ((event) | ((umask) << 8)))
/*MSR EN flag: when set start the counter!*/
//#define MSR_ENFLAG (0x1<<22)
#define MSR_ENFLAG (0x1<<22)
/* 32bit insn v3*/
/*
 * Program the MSR whose address is in ecx with the value in eax (high 32
 * bits written as zero), in two wrmsr steps: first the MSR is written with
 * 0, then with eax | MSR_ENFLAG.
 *
 * Because MSR_ENFLAG is ORed in unconditionally, the final value written
 * always has that bit set, whatever the caller passed.
 */
static inline void rtxen_write_msr(uint32_t eax, uint32_t ecx)
{
/* Step 1: write 0 to the MSR selected by ecx. */
__asm__ __volatile__ ("movl %0, %%ecx\n\t"
"xorl %%edx, %%edx\n\t"
"xorl %%eax, %%eax\n\t"
"wrmsr\n\t"
: /* no outputs */
: "m" (ecx)
: "eax", "ecx", "edx" /* all clobbered */);
eax |= MSR_ENFLAG;
/* Step 2: write the requested value with the enable flag set. */
__asm__("movl %0, %%ecx\n\t" /* ecx contains the number of the MSR to set */
"xorl %%edx, %%edx\n\t"/* edx contains the high bits to set the MSR to */
"movl %1, %%eax\n\t" /* eax contains the low bits to set the MSR to */
"wrmsr\n\t"
: /* no outputs */
: "m" (ecx), "m" (eax)
: "eax", "ecx", "edx" /* clobbered */);
}
/*
 * Read the MSR whose address is stored in *ecx using rdmsr.
 * The low 32 bits are stored in *eax and the high 32 bits in *edx.
 */
static inline void rtxen_read_msr(uint32_t* ecx, uint32_t *eax, uint32_t* edx)
{
    uint32_t low;
    uint32_t high;

    __asm__ __volatile__("rdmsr"
                         : "=d" (high), "=a" (low)
                         : "c" (*ecx));
    *eax = low;
    *edx = high;
}
/*
 * Generate some memory traffic by writing every element of a 1000-byte
 * local array.
 *
 * The array is volatile so the stores are actually performed: without it the
 * buffer is never read, and an optimizing compiler is free to remove the
 * whole loop, leaving nothing for the counters to observe.
 */
static inline void delay(void )
{
    volatile char tmp[1000];
    int i;

    for (i = 0; i < (int) sizeof(tmp); i++)
        tmp[i] = (char) (i * 2);
}
/*
 * Selects which measurement init_module() runs. Only UOPS and L3 have a
 * case in that switch; the other values reach its default branch.
 */
enum cache_level
{
UOPS,
L1I,
L1D,
L2,
L3
};
/*
 * Module entry point: programs one performance event select MSR, reads the
 * matching counter MSR back and logs the result. The measurement is chosen
 * by the hard-coded `op` (currently UOPS). Always returns 0.
 */
int init_module(void)
{
enum cache_level op;
uint32_t eax, edx, ecx;
uint64_t l3_all;
op = UOPS;
switch(op)
{
case UOPS:
/* 0x187 and 0xc2 are the values of PERFEVTSEL1 and PMC1 defined in this file. */
eax = 0x0001010E;
eax |= MSR_ENFLAG;
ecx = 0x187;
printk(KERN_INFO "UOPS Demo: write_msr: eax=%#010x, ecx=%#010x\n", eax, ecx);
rtxen_write_msr(eax, ecx);
/*
 * The counter is read immediately after being enabled, without stopping it
 * first; the accompanying answer names that missing stop step as the reason
 * zero is read.
 */
ecx = 0xc2;
eax = 1;
edx = 2;
rtxen_read_msr(&ecx, &eax, &edx);
printk(KERN_INFO "UOPS Demo: read_msr: edx=%#010x, eax=%#010x\n", edx, eax);
break;
case L3:
/* Build the event-select value: user + kernel mode, L3 request event/umask, enable flag. */
eax = 0;
SET_MSR_USR_BIT(eax);
SET_MSR_OS_BIT(eax);
SET_EVENT_MASK(eax, L3_ALLREQ_EVENT, L3_ALLREQ_MASK);
eax |= MSR_ENFLAG;
ecx = PERFEVTSEL2;
printk(KERN_INFO "before wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
rtxen_write_msr(eax, ecx);
printk(KERN_INFO "after wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
printk(KERN_INFO "L3 all request set MSR PMC2\n");
printk(KERN_INFO "delay by access an array\n");
delay();
/* eax/edx are preset to 1 and 2 so that an rdmsr that wrote nothing would be visible in the log. */
ecx = PMC2;
eax = 1;
edx = 2;
printk(KERN_INFO "rdmsr: ecx=%#010x\n", ecx);
rtxen_read_msr(&ecx, &eax, &edx); /*need to pass into address!*/
/* Combine the two 32-bit halves into the 64-bit counter value. */
l3_all = ( ((uint64_t) edx << 32) | eax );
printk(KERN_INFO "rdmsr: L3 all request is %llu (%#010lx)\n", l3_all, (unsigned long)l3_all);
break;
default:
printk(KERN_INFO "operation not implemented yet\n");
}
/*
* A non 0 return means init_module failed; module can't be loaded.
*/
return 0;
}
/* Module exit point: only logs a message; the MSRs written at init are left as they are. */
void cleanup_module(void)
{
printk(KERN_INFO "Goodbye world 1.\n");
}
The result I have is:
[ 1780.946584] UOPS Demo: write_msr: eax=0x0001010e, ecx=0x00000187
[ 1780.946590] UOPS Demo: read_msr: edx=0x00000000, eax=0x00000000
[ 1818.595055] Goodbye world 1.
[ 1821.153947] UOPS Demo: write_msr: eax=0x0041010e, ecx=0x00000187
[ 1821.153950] UOPS Demo: read_msr: edx=0x00000000, eax=0x00000000

I finally solved it with the help of #Manuel Selva!
The correct flow of setting a perf. counter is:
Step 1: set msr and enable the counter by setting the EN bit in eax;
Step 2: stop the counter by writing to msr
Step 3: read the counter
I missed the step 2, that's why it always gives me 0. It makes sense to report 0 if I want to read the counter before stopping it.
The correct code of the switch statement is as follows:
switch(op)
{
case UOPS:
eax = 0x0051010E;
eax |= MSR_ENFLAG;
ecx = 0x187;
printk(KERN_INFO "UOPS Demo: write_msr: eax=%#010x, ecx=%#010x\n", eax, ecx);
rtxen_write_msr(eax, ecx);
//stop counting
eax = 0x0011010E;
rtxen_write_msr(eax,ecx);
ecx = 0xc2;
eax = 1;
edx = 2;
rtxen_read_msr(&ecx, &eax, &edx);
printk(KERN_INFO "UOPS Demo: read_msr: edx=%#010x, eax=%#010x\n", edx, eax);
break;
case L3:
eax = 0;
SET_MSR_USR_BIT(eax);
SET_MSR_OS_BIT(eax);
SET_EVENT_MASK(eax, L3_ALLREQ_EVENT, L3_ALLREQ_MASK);
eax |= MSR_ENFLAG;
eax |= (1<<20); //INT bit: counter overflow
ecx = PERFEVTSEL2;
printk(KERN_INFO "before wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
rtxen_write_msr(eax, ecx);
printk(KERN_INFO "after wrmsr: eax=%#010x, ecx=%#010x\n", eax, ecx);
printk(KERN_INFO "L3 all request set MSR PMC2\n");
printk(KERN_INFO "delay by access an array\n");
delay();
eax &= (~MSR_ENFLAG);
rtxen_write_msr(eax, ecx);
printk(KERN_INFO "stop the counter, eax=%#010x\n", eax);
ecx = PMC2;
eax = 1;
edx = 2;
printk(KERN_INFO "rdmsr: ecx=%#010x\n", ecx);
rtxen_read_msr(&ecx, &eax, &edx); /*need to pass into address!*/
l3_all = ( ((uint64_t) edx << 32) | eax );
printk(KERN_INFO "rdmsr: L3 all request is %llu (%#010lx)\n", l3_all, (unsigned long)l3_all);
break;
default:
printk(KERN_INFO "operation not implemented yet\n");
}