Reading uncore counters through perf_event_open - c

I'm writing a program to monitor uncore events using perf_event_open. I needed to monitor the uncore_cha type, but perf event open gives an invalid argument error.
According to the perf_event_open man page, dynamic PMUs can be set in the type field of the perf_event_attr structure.
The type value can be found in /sys/bus/event_source/devices/'pmu-type'/type.
Upon setting this value, perf_event_open returns -1 and throws Error opening leader error. Upon examining the errno, it is found to be an Invalid argument error.
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
#include <linux/pmu.h>
#include <linux/hw_breakpoint.h>
long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
int
main(int argc, char **argv)
{
struct perf_event_attr pe;
long long count;
int fd;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = 25; // // Type value 25 for PMU-Type uncore_cha_0
pe.size = sizeof(struct perf_event_attr);
pe.config = 0x0111; //Event to be monitored
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
fd = perf_event_open(&pe, 0, -1, -1, 0);
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", pe.config);
exit(EXIT_FAILURE);
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
printf("Measuring instruction count for this printf\n");
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
read(fd, &count, sizeof(long long));
printf("Used %lld instructions\n", count);
close(fd);
}
Any help regarding this would be highly appreciated.

Related

perf_event reports unexpected cache miss amount: accessing N elements in an array incurs 2N cache misses

I'm trying to investigate the relationship between accessing array elements and cache misses. I wrote the following codes.
#include <asm/unistd.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <inttypes.h>
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags) {
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
int main(int argc, char **argv) {
struct perf_event_attr pe;
long long count;
int fd;
char *chars, c;
uint64_t n = 100000000;
int step = 64;
if (argc > 1) {
step = atoi(argv[1]);
}
chars = malloc(n * sizeof(char));
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HW_CACHE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_CACHE_L1D |
PERF_COUNT_HW_CACHE_OP_READ << 8 |
PERF_COUNT_HW_CACHE_RESULT_MISS << 16;
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
fd = perf_event_open(&pe, 0, -1, -1, 0);
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", pe.config);
exit(EXIT_FAILURE);
}
for (size_t i = 0; i < n; i++) {
chars[i] = 1;
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
// Read from memory
for (size_t i = 0; i < n; i += step) {
c = chars[i];
}
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
read(fd, &count, sizeof(long long));
printf("%lld\n", count);
close(fd);
free(chars);
}
It is easy to know that only n/step elements in the array chars are accessed and assigned to c. I think the number of cache misses should be n/step if step is larger than the cache line (64 usually). There is no problem when step is small, i.e.,the printed number of count is about n/step. However, if step is a large number, e.g., 1000000, count is equal to about 2n/step. This has confused me for a long time. Could anyone explain this odd result?

problem in a variant of copy command using mmap()

I have a program which is another variant of copy program in linux(Actually I'm on Mac OSX).
In order to support copying large files, I wrote something like this:
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/mman.h>
#define BUFFSIZE 65535
#define PAGESIZE 4096
int main(int argc, char **argv){
char *source, *destination;
int src_fd, dst_fd;
unsigned long long bytes_read;
int bytes = BUFFSIZE;
struct timeval start, end;
int overall_time = 0;
unsigned long long offset = 0;
struct stat statbuf;
if(argc < 3){
printf("copy <source> <destination>\n");
exit(EXIT_FAILURE);
}
source = argv[1];
destination = argv[2];
src_fd = open(source, O_RDONLY, 0777);
if(src_fd < 0){
perror("src_fd");
exit(EXIT_FAILURE);
}
//bytes_read = lseek(src_fd, 0, SEEK_END);
fstat(src_fd, &statbuf);
bytes_read = statbuf.st_size;
dst_fd = open(destination, O_RDWR | O_CREAT, 0777);
if(dst_fd < 0){
perror("dst_fd");
exit(EXIT_FAILURE);
}
lseek(dst_fd, bytes_read -1, SEEK_SET);
write(dst_fd, "", 1);
gettimeofday(&start, NULL);
while(bytes_read > 0){
if(bytes_read < BUFFSIZE){
bytes = bytes_read;
bytes_read = 0;
}
else{
bytes_read -= bytes;
}
void *src_map = mmap(NULL, bytes, PROT_READ, MAP_SHARED, src_fd, (off_t)offset);
if(src_map == (void*) MAP_FAILED){
perror("src_map");
exit(EXIT_FAILURE);
}
void *dst_map = mmap(NULL, bytes, PROT_WRITE, MAP_SHARED, dst_fd, (off_t)offset);
if(dst_map == (void*) MAP_FAILED){
perror("dst_map");
exit(EXIT_FAILURE);
}
memcpy(dst_map, src_map, bytes);
int src_unmp = munmap(src_map, bytes);
if(src_unmp == -1){
perror("src_unmap");
exit(EXIT_FAILURE);
}
int dst_unmp = munmap(dst_map, bytes);
if(dst_unmp == -1){
perror("dst_unmap");
exit(EXIT_FAILURE);
}
offset += 4096;
bytes_read -= bytes;
}
gettimeofday(&end, NULL);
printf("overall = %d\n", (end.tv_usec - start.tv_usec));
close(src_fd);
close(dst_fd);
return 0;
}
The goal is to measure the amount of time elapsed to copy a large file with the use of mmap().
The above code is not working for transferring 1GB file.
Any hint for that?
Thank you
Yes. The problem is in offset value. The offset value should be a multiple of page size.

Counting CPU cycles with `perf_event` in C yields different value than `perf`

I try to count the CPU cycles of a single process via a short C code snippet. A MWE is the cpucycles.c.
cpucycles.c (heavily based on the man page example)
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
static long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
long long
cpu_cycles(pid_t pid, unsigned int microseconds)
{
struct perf_event_attr pe;
long long count;
int fd;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_CPU_CYCLES;
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
fd = perf_event_open(&pe, pid, -1, -1, 0);
if (fd == -1) {
return -1;
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
usleep(microseconds);
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
read(fd, &count, sizeof(long long));
close(fd);
return count;
}
int main(int argc, char **argv)
{
printf("CPU cycles: %lld\n", cpu_cycles(atoi(argv[1]), atoi(argv[2])));
return 0;
}
Next, I compile it, set the perf_event access rights, start a process with full CPU utilization and count the CPU cycles of it via perf as well as my cpucycles.
$ gcc -o cpucycles cpucycles.c
$ echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid
$ cat /dev/urandom > /dev/null &
[1] 3214
$ perf stat -e cycles -p 3214 -x, sleep 1
3072358388,,cycles,1000577415,100,00,,,,
$ ./cpucycles 3214 1000000
CPU cycles: 287953
Obviously, only the ´3072358388´ CPU cycles from ´perf´ are correct for my 3 GHz CPU. Why is my ´cpucycles´ returning such ridicules small values?
You're excluding the kernel in your profiling when setting pe.exclude_kernel = 1;.
I just verified that by just setting that flag to 0, I get large numbers, and setting it to 1 I get small numbers.
cat /dev/urandom > /dev/null will pretty much spend all its cpu time inside the kernel. The userland bits will be a read to a buffer and the write from that buffer while all the heavy lifting in this case is done by the kernel.

How to get perf_event results for 2nd Nexus7 with Krait CPU

all.
I try to get PMUs information such as Instructions, Cycle, Cache miss and etc. on 2nd Nexus7 with Krait CPU.
The perf tool is not working correctly.
Therefore, I am using follow a sample source code in perf_event tutorials.
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
static long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
int
main(int argc, char **argv)
{
struct perf_event_attr pe;
long long count;
int fd;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_CPU_CYCLES;
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
fd = perf_event_open(&pe, 0, -1, -1, 0);
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", pe.config);
exit(EXIT_FAILURE);
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
printf("Measuring Cycles for this printf\n");
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
read(fd, &count, sizeof(long long));
printf("Used %lld Cycles", count);
close(fd);
}
I tried to run this code on x86 linux machine.
The result show that "Used 123123 Cycles."
However, I couldn't get any PMUs event on 2nd nexus7.
It always returns "Used 0 Cycles."
The PMU driver is enabled as follows.
<6>[ 0.152832] hw perfevents: enabled with ARMv7 Krait PMU driver, 5 counters available
Also, I can find perf_event_msm_krait.c in the flo kernel 3.4 which is used for my Nexus7.
(I found a patch for supporting the Krait CPU; http://www.serverphorums.com/read.php?12,850329 . That includes perf_event_cpu.c file but I couldn't find it in kernel source. Is that way correct in order to support PMU for Nexus7 with Krait?)
Thank you in advance.

How to measure the 'user' time execution of a function in Linux and Windows

If I have a function foo which I would like to profile its "user" time (remove kernel or other processes time), how can I measure it in code (C/C++)?
I know of the following functions:
Windows
QueryPerformanceCounter
GetProcessTimes
Linux
gettimeofday
times
clock
Are there more ways? Each provide a different 'view' of time and non really provide accurate results.
The best way on Linux is as following: (As extracted and modified a bit from the Linux kernel perf_event_open man page)
Code:
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
int
main(int argc, char **argv)
{
struct perf_event_attr pe;
long long count;
int fd;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
pe.config = PERF_COUNT_HW_INSTRUCTIONS;
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
fd = perf_event_open(&pe, 0, -1, -1, 0);
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", pe.config);
exit(EXIT_FAILURE);
}
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
printf("Measuring instruction count for this printf\n");
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
read(fd, &count, sizeof(long long));
printf("Used %lld instructions\n", count);
close(fd);
}
On Unix-like systems getrusage is what you're looking for. Specifically with the RUSAGE_SELF option. The user time will be in the ru_utime field in struct rusage. ru_stime counts system time.

Resources