Oops while traversing sk_write_queue in Linux kernel - c

My Linux 3.8.8 kernel would Oops while traversing tcp sk_write_queue in kernel, like below:
BUG: unable to handle kernel paging request at 00b85055
My code logic in kernel is as below:
Get struct socket from socket file descriptor;
Get struct sock from struct socket;
lock_sock();
Get sk_write_queue from struct sock;
traverse sk_write_queue, read linked skb buffer;
release_sock();
return to user space.
Step (5) sometimes triggers the Oops above; the offending code is:
skb_is_nonlinear(skb);
This means the skb pointer is invalid or NULL at that point. Shouldn't lock_sock() protect the traversal?
Code:
/*
 * sys_tcp_get - scan the not-yet-sent part of a TCP socket's write queue
 * for a 4-byte record whose first three bytes are 1, 2, 3.
 *
 * @fd: socket file descriptor to inspect.
 * @id: receives the fourth byte of the first matching record.
 *
 * Each skb is scanned in aligned 4-byte groups starting at offset 0; a
 * trailing partial group is ignored and groups never span two skbs.
 *
 * Returns 0 when a record was found and *id was written, -1 otherwise
 * (bad descriptor, socket without a struct sock, socket not ESTABLISHED,
 * pending error or send shutdown, empty send head, non-linear skb, or no
 * matching record).
 *
 * NOTE(review): @id is dereferenced directly. If this entry point is
 * reached with a user-space pointer it presumably needs put_user() -
 * confirm against the calling convention in use.
 */
asmlinkage long sys_tcp_get(int fd, int *id)
{
	int err, i, j;
	struct socket *sock;
	struct sock *sk;
	struct sk_buff_head *queue;
	struct sk_buff *skb;
	unsigned char buf[4] = {0};

	sock = sockfd_lookup(fd, &err);
	if (sock == NULL)
		return -1;

	/*
	 * From here on every exit that has not found a record reports -1.
	 * Setting it once also covers the case where the walk below reaches
	 * the end of the queue without a match, which previously returned an
	 * uninitialized value.
	 */
	err = -1;

	sk = sock->sk;
	if (sk == NULL)
		goto free_socket;

	lock_sock(sk);

	if (((1 << sk->sk_state) & ~TCPF_ESTABLISHED) ||
	    (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
		goto release_socket;

	queue = &(sk->sk_write_queue);
	if (tcp_send_head(sk) == NULL)
		goto release_socket;

	/* The queue head itself is the list sentinel, so stop when we wrap. */
	for (skb = tcp_send_head(sk); skb != (struct sk_buff *)(queue); skb = skb->next) {
		/* skb->data only covers the linear area; give up on paged skbs. */
		if (skb_is_nonlinear(skb))
			goto release_socket;

		i = 0;
		j = 0;
		while (i < skb->len) {
			buf[j++] = skb->data[i++];
			if (j == 4) {
				if (buf[0] == 1 && buf[1] == 2 && buf[2] == 3) {
					*id = buf[3];
					err = 0;
					goto release_socket;
				}
				j = 0;
			}
		}
	}

release_socket:
	release_sock(sk);
free_socket:
	sockfd_put(sock);
	return err;
}

Related

VFIO interrupts using eventfd: can eventfd semaphore behaviour be maintained?

I have a program running on a QEMU VM. The program running inside this VM gets notified by a program on the host via interrupts and using QEMU ivshmem. The program on the host creates an eventfd and sends this file descriptor to QEMU when the VM starts. The program in the guest then opens a VFIO group device and sets an interrupt request fd on this device. We can then add the interrupt fd to epoll and epoll_wait to wait for notifications from the host.
The thing is that I want a one-to-one match between the number of times the host writes to the eventfd and the number of events that are signaled in epoll_wait. For this I decided to use EFD_SEMAPHORE for the eventfds on the host and the guest. From my understanding, every time I write an 8-byte integer with value 1, the eventfd counter is incremented by 1. Then every time the eventfd is read, the counter is decremented by 1 (different from a regular eventfd, where each read clears the whole counter). For some reason I am not getting the desired behaviour, so I was wondering whether eventfds with the EFD_SEMAPHORE flag are not properly supported by either VFIO or QEMU's ivshmem.
Below is a simplified version of the parts I think are relevant and how I setup the notification system. I hope the code below is not too verbose. I tried to reduce the number of irrelevant parts (there is too much other code in the middle that is not particularly relevant to the problem) but not 100% sure what might be relevant or not.
Code host uses to signal guest
/*
 * Send one 64-bit integer over socket fd as a single message with no
 * ancillary data.
 *
 * Returns the number of bytes sent (sizeof(int64_t)) on success, or -1 if
 * sendmsg() failed or sent a different number of bytes.
 */
int ivshmem_uxsocket_send_int(int fd, int64_t i)
{
	struct iovec payload;
	struct msghdr hdr = {0};
	int sent;

	payload.iov_base = &i;
	payload.iov_len = sizeof(i);

	/* No destination address and no control data: just the payload. */
	hdr.msg_name = NULL;
	hdr.msg_namelen = 0;
	hdr.msg_iov = &payload;
	hdr.msg_iovlen = 1;
	hdr.msg_control = NULL;
	hdr.msg_controllen = 0;
	hdr.msg_flags = 0;

	sent = sendmsg(fd, &hdr, 0);
	if (sent != sizeof(int64_t))
		return -1;

	return sent;
}
/*
 * Send file descriptor fd over the socket uxfd as SCM_RIGHTS ancillary
 * data, accompanied by the 64-bit integer i as regular payload.
 *
 * Returns the number of payload bytes sent (sizeof(i)) on success, or -1
 * if sendmsg() failed or sent a different number of bytes.
 */
int ivshmem_uxsocket_sendfd(int uxfd, int fd, int64_t i)
{
	/* Control data needs at least one byte of ordinary data to ride on. */
	struct iovec payload = {
		.iov_base = &i,
		.iov_len = sizeof(i),
	};
	/* The union only exists to give the char buffer cmsghdr alignment. */
	union {
		char buf[CMSG_SPACE(sizeof(fd))];
		struct cmsghdr align;
	} ctl;
	struct msghdr hdr = {0};
	struct cmsghdr *rights;
	int sent;

	memset(&ctl, 0, sizeof(ctl));

	hdr.msg_name = NULL;
	hdr.msg_namelen = 0;
	hdr.msg_iov = &payload;
	hdr.msg_iovlen = 1;
	hdr.msg_control = &ctl;
	hdr.msg_controllen = sizeof(ctl);
	hdr.msg_flags = 0;

	/* Describe the single ancillary item: one descriptor to pass. */
	rights = CMSG_FIRSTHDR(&hdr);
	rights->cmsg_level = SOL_SOCKET;
	rights->cmsg_type = SCM_RIGHTS;
	rights->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(rights), &fd, sizeof(fd));

	sent = sendmsg(uxfd, &hdr, 0);
	if (sent != sizeof(i))
		return -1;

	return sent;
}
/* SETUP IVSHMEM WITH QEMU AND PASS THE EVENTFD USED TO
NOTIFY THE GUEST */
int ivshmem_uxsocket_accept()
{
int ret;
int cfd, ifd, nfd;
int64_t version = IVSHMEM_PROTOCOL_VERSION;
uint64_t hostid = HOST_PEERID;
int vmid = 0
/* Accept connection from qemu ivshmem */
if ((cfd = accept(uxfd, NULL, NULL)) < 0)
{
return -1;
}
/* Send protocol version as required by qemu ivshmem */
ret = ivshmem_uxsocket_send_int(cfd, version);
if (ret < 0)
{
return -1;
}
/* Send vm id to qemu */
ret = ivshmem_uxsocket_send_int(cfd, vmid);
if (ret < 0)
{
return -1;
}
/* Send shared memory fd to qemu */
ret = ivshmem_uxsocket_sendfd(cfd, shm_fd, -1);
if (ret < 0)
{
return -1;
}
/* Eventfd used by guest to notify host */
if ((nfd = eventfd(0, EFD_SEMAPHORE | EFD_NONBLOCK)) < 0)
{
return -1;
}
/* Ivshmem protocol requires to send host id
with the notify fd */
ret = ivshmem_uxsocket_sendfd(cfd, nfd, hostid);
if (ret < 0)
{
return -1;
}
/* THIS IS THE EVENTFD OF INTEREST TO US: USED BY HOST
TO NOTIFY GUEST */
if ((ifd = eventfd(0, EFD_SEMAPHORE | EFD_NONBLOCK)) < 0)
{
return -1;
}
ret = ivshmem_uxsocket_sendfd(cfd, ifd, vmid);
if (ret < 0)
{
return -1;
}
if (epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &ev) < 0)
{
return -1;
}
return 0;
}
/* NOW EVERY TIME WE WANT TO NOTIFY THE GUEST
WE CALL THE FOLLOWING FUNCTION */
/*
 * Signal the guest once by writing the 64-bit value 1 to fd.
 *
 * Returns 0 when all eight bytes were written, -1 otherwise.
 *
 * The result of write() is compared as a signed value on purpose: comparing
 * it against sizeof() directly converts -1 to a huge unsigned number, so a
 * failed write would be reported as success.
 */
int notify_guest(int fd)
{
	int ret;
	uint64_t buf = 1;

	ret = write(fd, &buf, sizeof(uint64_t));
	if (ret != (int)sizeof(uint64_t))
	{
		return -1;
	}

	return 0;
}
Code guest uses to receive notifications from host
/* THIS FUNCTION SETS THE IRQ THAT RECEIVES THE
NOTIFICATIONS FROM THE HOST */
int vfio_set_irq(int dev)
{
int fd;
struct vfio_irq_set *irq_set;
char buf[sizeof(struct vfio_irq_set) + sizeof(int)];
if ((fd = eventfd(0, EFD_SEMAPHORE | EFD_NONBLOCK)) < 0)
{
return -1;
}
irq_set = (struct vfio_irq_set *) buf;
irq_set->argsz = sizeof(buf);
irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
irq_set->index = 2;
irq_set->start = 0;
irq_set->count = 1;
memcpy(&irq_set->data, &fd, sizeof(int));
if (ioctl(dev, VFIO_DEVICE_SET_IRQS, irq_set) < 0)
{
return -1;
}
return irq_fd;
}
/* The guest sets up the ivshmem region from QEMU and sets the
interrupt request. */
/*
 * Bring up the VFIO no-IOMMU device and register its interrupt eventfd
 * with the epoll set epfd.
 *
 * Steps, in order: open the container, check the API version and the
 * no-IOMMU extension, open the group, check it is viable, attach it to the
 * container, select the no-IOMMU model, obtain the device fd, query device
 * info, install the interrupt eventfd and add it to epfd for EPOLLIN.
 *
 * dev and epfd are not declared in this function; they are taken from the
 * enclosing scope.
 *
 * Returns 0 on success, -1 on the first step that fails.
 *
 * NOTE(review): descriptors opened here are not closed on error paths.
 */
int vfio_init()
{
	int cont, group, irq_fd;
	struct epoll_event ev;
	struct vfio_group_status g_status = { .argsz = sizeof(g_status) };
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };

	/* Create vfio container */
	if ((cont = open("/dev/vfio/vfio", O_RDWR)) < 0)
	{
		return -1;
	}

	/* Check API version of container */
	if (ioctl(cont, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
	{
		return -1;
	}

	if (!ioctl(cont, VFIO_CHECK_EXTENSION, VFIO_NOIOMMU_IOMMU))
	{
		return -1;
	}

	/* Open the vfio group */
	if ((group = open(VFIO_GROUP, O_RDWR)) < 0)
	{
		return -1;
	}

	/*
	 * Test if group is viable and available. The ioctl result is checked
	 * so that a failed query is not mistaken for valid status flags.
	 */
	if (ioctl(group, VFIO_GROUP_GET_STATUS, &g_status) < 0)
	{
		return -1;
	}
	if (!(g_status.flags & VFIO_GROUP_FLAGS_VIABLE))
	{
		return -1;
	}

	/* Add group to container */
	if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &cont) < 0)
	{
		return -1;
	}

	/* Enable desired IOMMU model */
	if (ioctl(cont, VFIO_SET_IOMMU, VFIO_NOIOMMU_IOMMU) < 0)
	{
		return -1;
	}

	/* Get file descriptor for device */
	if ((dev = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, VFIO_PCI_DEV)) < 0)
	{
		return -1;
	}

	/* Get device info */
	if (ioctl(dev, VFIO_DEVICE_GET_INFO, &device_info) < 0)
	{
		return -1;
	}

	/* Set interrupt request fd */
	if ((irq_fd = vfio_set_irq(dev)) < 0)
	{
		return -1;
	}

	/* Add interrupt request fd to interest list */
	if (vfio_subscribe_irq() < 0)
	{
		return -1;
	}

	/*
	 * Register the interrupt fd for readability.
	 * NOTE(review): ev.data is presumably a union, in which case the fd
	 * assignment overwrites part of the ptr assignment - confirm which
	 * member the event loop actually reads.
	 */
	ev.events = EPOLLIN;
	ev.data.ptr = EP_NOTIFY;
	ev.data.fd = irq_fd;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, irq_fd, &ev) != 0)
	{
		return -1;
	}

	return 0;
}
/*
 * Consume one 8-byte value from fd with a single read().
 *
 * Returns the result of read() (8 on a full read, -1 on error), except
 * that a 0-byte read is also reported as -1.
 */
int ivshmem_drain_evfd(int fd)
{
	uint64_t counter;
	int got = read(fd, &counter, sizeof(uint64_t));

	return (got == 0) ? -1 : got;
}
/* I should get every notification from the host here,
but it seems that not all notifications are going
through. The number of calls to notify_guest does not
match the number of events received from epoll_wait
here */
/*
 * Poll the epoll set epfd once without blocking and, for every ready entry
 * that reports EPOLLIN, drain one value from irq_fd and call handle().
 *
 * epfd and irq_fd are not declared in this function; they are taken from
 * the enclosing scope.
 *
 * Returns the number of notifications handled in this call, or -1 if
 * epoll_wait() failed.
 */
int notify_poll()
{
	int i, n;
	int handled = 0;
	struct epoll_event evs[32];

	/* Timeout 0: report what is ready right now and return immediately. */
	n = epoll_wait(epfd, evs, 32, 0);
	if (n < 0)
	{
		return -1;
	}

	for (i = 0; i < n; i++)
	{
		if (evs[i].events & EPOLLIN)
		{
			/* Drain evfd */
			ivshmem_drain_evfd(irq_fd);

			/* Handle notification ... */
			handle();
			handled++;
		}
	}

	return handled;
}

Separate connect function for libmodbus

I am trying to group the operation under libmodbus for Mod-bus connection and get-value into two simpler function as below.
However, it always causes a segmentation fault (core dumped) when I try to get a value from the device (in get_float, at modbus_read_registers).
Can anyone tell me how to fix it?
/*
 * Create a Modbus TCP context for ip_addr, configure slave id, debug mode
 * and timeouts, and connect it.
 *
 * Returns the result of modbus_connect() (-1 on failure), or -1 if the
 * context could not be allocated.
 *
 * NOTE: ctx is received by value, so the context created here is assigned
 * only to this function's local copy; the caller's pointer is not updated.
 * Returning the context to the caller would require a modbus_t ** (or a
 * returned pointer), which is a signature change and is not made here.
 */
int connect(char *ip_addr, struct timeval timeout, modbus_t *ctx){
	int fail = 0;

	ctx = modbus_new_tcp(ip_addr, MODBUS_SERVER_PORT);
	if (ctx == NULL) {
		/* Nothing to configure or free if allocation failed. */
		fprintf(stderr, "Connection failed: %s\n",
			modbus_strerror(errno));
		return -1;
	}

	modbus_set_slave(ctx, MODBUS_DEVICE_ID);
	modbus_set_debug(ctx, MODBUS_DEBUG);

	/*
	 * NOTE(review): this calls the byte-timeout *getter*, which writes
	 * into timeout rather than applying it - presumably
	 * modbus_set_byte_timeout() was intended; confirm.
	 */
	timeout.tv_sec = MODBUS_TIMEOUT_SEC;
	timeout.tv_usec = MODBUS_TIMEOUT_USEC;
	modbus_get_byte_timeout(ctx, &timeout.tv_sec, &timeout.tv_usec);

	timeout.tv_sec = MODBUS_TIMEOUT_SEC;
	timeout.tv_usec = MODBUS_TIMEOUT_USEC;
	modbus_set_response_timeout(ctx, timeout.tv_sec, timeout.tv_usec);

	fail = modbus_connect(ctx);
	if (fail == -1) {
		fprintf(stderr, "Connection failed: %s\n",
			modbus_strerror(errno));
		modbus_free(ctx);
		fail = -1;
	}

	return fail;
}
/*
 * Read two holding registers starting at (addr - 1) from ctx and decode
 * them as a float in ABCD byte order into *val.
 *
 * Returns the result of modbus_read_registers(); when that is <= 0 an
 * error is printed to stderr and *val is left untouched.
 */
int get_float(modbus_t *ctx, uint16_t addr, float *val){
	uint16_t regs[2];
	int rc;

	printf("1\n");
	rc = modbus_read_registers(ctx, (addr-1), 2, regs);
	printf("2\n");

	if (rc > 0) {
		*val = modbus_get_float_abcd(regs);
		return rc;
	}

	fprintf(stderr, "Reading error(%d): %s\n", addr, modbus_strerror(errno));
	return rc;
}
Besides, I can successfully run the similar code when I put them in same function as below:
/*
 * One-shot helper: connect to the Modbus TCP server at ip_addr, read two
 * holding registers starting at (addr - 1), decode them as an ABCD-ordered
 * float into *val, then close and release the connection.
 *
 * Returns 0 on success, -1 if the context could not be created, the
 * connection failed or the read failed. *val is only written on success.
 */
int connect_n_getFloat(char *ip_addr, uint16_t addr, float *val){
	int fail = 0;
	uint16_t value[2];
	modbus_t *ctx = modbus_new_tcp(ip_addr, MODBUS_SERVER_PORT);

	if (ctx == NULL) {
		fprintf(stderr, "Connection failed: %s\n",
			modbus_strerror(errno));
		return -1;
	}

	ctxConfig(ctx);

	if (modbus_connect(ctx) != 0) {
		fprintf(stderr, "Connection failed: %s\n",
			modbus_strerror(errno));
		modbus_free(ctx);
		return -1;
	}

	if (modbus_read_registers(ctx, (addr-1), 2, value) > 0) {
		*val = modbus_get_float_abcd(value);
	} else {
		fprintf(stderr, "Reading error(%d): %s\n", addr, modbus_strerror(errno));
		fail = -1;
	}

	/*
	 * The context never leaves this function, so it is closed and freed
	 * on every path once the connection has been established.
	 */
	modbus_close(ctx);
	modbus_free(ctx);

	return fail;
}
You're passing a context pointer to the connect function, but should be passing a pointer to a pointer, so you can return the allocated context and continue using it in further calls.
Change the function signature, and ctx usage, from
int connect(char *ip_addr, struct timeval timeout, modbus_t *ctx) {
int fail = 0;
ctx = modbus_new_tcp(ip_addr, MODBUS_SERVER_PORT);
to
int connect(char *ip_addr, struct timeval timeout, modbus_t **ctx) {
int fail = 0;
*ctx = modbus_new_tcp(ip_addr, MODBUS_SERVER_PORT);
This also explains why it works when you put them in the same function.

Why isn't the kernel receiving my generic netlink messages?

I'm trying to send nested attributes from user space to kernel using generic netlink, the function nl_send_auto() returns 52 (which was supposed to be the numbers of bytes sent to kernel) but the kernel isn't receiving the messages. Is there some problem with my approach? Here is the code that I wrote on user space:
/*
 * User-space fragment: allocate a netlink socket, connect it to generic
 * netlink, resolve the family id, build a CREATE_STATE message with one
 * nested attribute (KLUA_NL_STATE) holding a name and two u32 values, and
 * send it.
 *
 * ret, cmd and ctrl are used below but are not declared in this fragment.
 * None of the early returns frees msg or releases sock.
 */
int err = -1;
struct nl_msg *msg;
struct nlattr *attr;
struct nl_sock *sock;
int family;
int send = 0;
if ((sock = nl_socket_alloc()) == NULL)
return err;
/* Past this check err is 0, so the later "return err" paths return 0. */
if ((err = genl_connect(sock)))
return err;
/* family receives the resolved id of the family named by FAMILY. */
if ((family = genl_ctrl_resolve(sock, FAMILY)) < 0)
return family;
if ((msg = nlmsg_alloc()) == NULL)
return err;
/* The fourth argument here is the FAMILY macro, not the family variable. */
if ((genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, FAMILY, 0,
NLM_F_REQUEST, CREATE_STATE, 1)) == NULL)
return err;
/* This branch runs only when attr is NULL, and passes that NULL on. */
if (!(attr = nla_nest_start(msg, KLUA_NL_STATE))){
nla_nest_cancel(msg, attr);
return err;
}
/* The put results are stored in ret, but err is what gets returned. */
if ((ret = nla_put_string(msg, STATE_NAME, cmd->name)) ||
(ret = nla_put_u32(msg, MAX_ALLOC, cmd->maxalloc)) ||
(ret = nla_put_u32(msg, CURR_ALLOC, cmd->curralloc))
)
return err;
nla_nest_end(msg, attr);
/* Sends through ctrl->sock rather than the sock connected above. */
if ((send = nl_send_auto(ctrl->sock, msg)) < 0)
return send;
printf("All done sended %d bytes\n", send);
nlmsg_free(msg);
This code prints 52, which is the bytes sent to kernel;
The FAMILY macro is defined as (both in kernel and user space):
#define FAMILY "family"
My netlink attributes are (both for kernel and user space):
/*
 * Netlink attribute identifiers. With no explicit initializers the
 * enumerators are numbered from 0: KLUA_NL_STATE = 0, STATE_NAME = 1,
 * MAX_ALLOC = 2, CURR_ALLOC = 3, ATTR_COUNT = 4.
 */
enum {
KLUA_NL_STATE,
STATE_NAME,
MAX_ALLOC,
CURR_ALLOC,
ATTR_COUNT,
/* Highest attribute id in use (ATTR_COUNT - 1, i.e. 3 here). */
#define ATTR_MAX (ATTR_COUNT - 1)
};
My enum for operation is:
/* Generic netlink command identifiers; CREATE_STATE is the only one. */
enum {
CREATE_STATE = 16,
};
And my kernel code is:
/*
 * Attribute validation policy, indexed by attribute id. It must have one
 * entry per id from 0 through ATTR_MAX inclusive, hence ATTR_MAX + 1
 * elements: the family advertises maxattr = ATTR_MAX, so an array of only
 * ATTR_MAX elements would be indexed one past its end.
 * Attributes without an explicit entry are zero-initialized.
 */
struct nla_policy lunatik_policy[ATTR_MAX + 1] = {
	[KLUA_NL_STATE] = { .type = NLA_NESTED },
};
/* Forward declaration so the handler can be referenced in l_ops below. */
static int klua_create_state(struct sk_buff *buff, struct genl_info *info);
/*
 * Operations table for the family: a single command, CREATE_STATE,
 * dispatched to klua_create_state().
 */
static const struct genl_ops l_ops[] = {
{
.cmd = CREATE_STATE,
.doit = klua_create_state,
#if LINUX_VERSION_CODE < KERNEL_VERSION(5,2,0)
/* Before kernel 5.2.0, each operation has its own policy */
.policy = lunatik_policy
#endif
},
};
#define KLUA_NL_STATE_ATTRS_COUNT 3
/*
 * Generic netlink family description: named FAMILY, version 1, accepting
 * attribute ids up to ATTR_MAX, with the operations listed in l_ops.
 *
 * NOTE(review): .policy is set here unconditionally while the per-op
 * .policy in l_ops is guarded by a kernel-version check - confirm this
 * member exists on every kernel version the module is built for.
 */
struct genl_family lunatik_family = {
.name = FAMILY,
.version = 1,
.maxattr = ATTR_MAX,
.netnsok = true,
.policy = lunatik_policy,
.module = THIS_MODULE,
.ops = l_ops,
.n_ops = ARRAY_SIZE(l_ops),
};
/*
 * Handler for the CREATE_STATE command. It ignores both arguments, logs a
 * fixed message with pr_info() and returns 0.
 */
static int klua_create_state(struct sk_buff *buff, struct genl_info *info)
{
pr_info("I received the message\n");
return 0;
}
This code doesn't print anything on dmesg, and I would like to know why.
Your actual problems
During Linux 5.2 refactors, the semantics of the NLA_F_NESTED flag changed somewhat. It appears you now need to always include it when you call nla_nest_start():
if (!(attr = nla_nest_start(msg, KLUA_NL_STATE))){
...
}
Should be
if (!(attr = nla_nest_start(msg, NLA_F_NESTED | KLUA_NL_STATE))){
...
}
Yes, I'm well aware the libnl library should obviously do this for you, and will perhaps do so in the future, but unfortunately this is where we are now.
Also:
enum {
KLUA_NL_STATE,
...
};
Attribute zero is always reserved. You need to change that into
enum {
KLUA_NL_STATE = 1,
...
};
Just FYI: operation zero is also reserved, so it's fortunate that you chose 16. But do keep it in mind in the future.
Syntactic issues
These are probably just copy-paste errors, but I'm including them anyway for the benefit of other people landing on this page looking for examples.
if ((genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, FAMILY, 0,
NLM_F_REQUEST, CREATE_STATE, 1)) == NULL)
return err;
Should be
if ((genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0,
NLM_F_REQUEST, CREATE_STATE, 1)) == NULL)
return err;
Also:
if ((ret = nla_put_string(msg, STATE_NAME, cmd->name)) ||
(ret = nla_put_u32(msg, MAX_ALLOC, cmd->maxalloc)) ||
(ret = nla_put_u32(msg, CURR_ALLOC, cmd->curralloc))
)
return err;
Should be
if ((err = nla_put_string(msg, STATE_NAME, cmd->name)) ||
(err = nla_put_u32(msg, MAX_ALLOC, cmd->maxalloc)) ||
(err = nla_put_u32(msg, CURR_ALLOC, cmd->curralloc))
)
return err;
Also:
if ((send = nl_send_auto(ctrl->sock, msg)) < 0)
return send;
Should be
if ((send = nl_send_auto(sock, msg)) < 0)
return send;

C: Remaining bytes on AF_UNIX socket

I got some problems with AF_UNIX socket communication because after writing a data buffer there seem to remain some hanging bytes to read that I do not know where they come from.
I am writing a multithreaded server program in C that communicates with clients through AF_UNIX sockets, it must implement a simple chatroom. Among other things, the server must implement file transfers between clients and servers and i encountered problems when i try to send a quite large file (269K) from Server to client. (With smaller files i do not have any problems)
For file transfer I use the mmap() function, which returns a pointer to the mapping of the file I want to send; then I use write() to write that data to the socket connected to the client that must receive the file.
After the write() call I check that the returned value is equal to the file size. (always verified)
The client, after receiving the file, checks the size of the data read (always verified) and starts waiting for other messages, so it calls a blocking read(). This is the point where I found the error, because the client reads something that should not be there, as if there were something left to read on the socket.
I've been debugging this part (both server and client) for two days and I have not yet been able to understand the origin of the problem.
I am sure that no other thread write on the same socket at the same time
Does any of you have an idea of what the cause of this error is?
I try to post some useful code thinking at a normal operation sequence:
First of all message structure:
/*
 * Wire/message layout used by the chat protocol.
 * The *_t names used for the members below (message_data_hdr_t,
 * message_hdr_t, message_data_t) are not defined in this excerpt;
 * presumably they are typedefs of the structs declared here - confirm.
 */

/* Fixed-size message header: operation code and sender name. */
struct message_hdr
{
op_t op;
char sender[MAX_NAME_LENGTH+1];
};
/* Fixed-size header of the data part: receiver name and payload length. */
struct message_data_hdr{
char receiver[MAX_NAME_LENGTH+1];
unsigned int len;
};
/* Data part: its header plus a pointer to len bytes of payload. */
struct message_data
{
message_data_hdr_t hdr;
char *buf;
};
/* A complete message: header followed by the data part. */
struct message
{
message_hdr_t hdr;
message_data_t data;
};
A server->client file transfer starts with server that send a message_hdr_t to a client which is waiting on a read() (the client expects to receive only a message_hdr_t).
/*
 * Write one message header to fd with a single writev() call.
 *
 * Returns the result of writev(), or -1 with errno set to EINVAL when hdr
 * is NULL or fd is negative.
 */
int sendHeader(long fd, message_hdr_t* hdr)
{
	struct iovec vec;

	if (fd < 0 || hdr == NULL) {
		errno = EINVAL;
		return -1;
	}

	vec.iov_base = hdr;
	vec.iov_len = sizeof(message_hdr_t);

	return writev(fd, &vec, 1);
}
The client understands from the operation code (message.hdr.op) that it is a file type message and it begins to wait for file,
So server send it:
/*
 * Send the data part of a message: first its fixed-size header, then, if
 * hdr.len is non-zero, the payload.
 *
 * The payload is sent with exactly one write() call; its return value is
 * only checked for being <= 0, so a write that transfers fewer than
 * hdr.len bytes is reported as success.
 *
 * Returns -1 on invalid arguments (errno = EINVAL) or on a failed write;
 * otherwise the result of the last write performed.
 */
int sendData(long fd, message_data_t *msg)
{
if(msg == NULL || fd < 0) {errno = EINVAL; return -1;}
int test;
struct iovec iov;
iov.iov_base = &(msg->hdr);
iov.iov_len = sizeof(message_data_hdr_t);
test = writev(fd, &(iov), 1);
if(test == -1){return -1;}
if (msg->hdr.len != 0)
{
/* Single write(): no retry if fewer than hdr.len bytes are accepted. */
test = write(fd, msg->buf, msg->hdr.len);
if(test <= 0)
return -1;
}
return test;
}
And client read it:
/*
 * Receive the data part of a message: first its fixed-size header, then,
 * if hdr.len is non-zero, a freshly malloc'ed payload of hdr.len bytes.
 *
 * The payload is read with exactly one read() call and anything other than
 * a full hdr.len-byte result is treated as failure; data->buf is not freed
 * on that path.
 *
 * Returns -1 on invalid arguments (errno = EINVAL), on a failed or empty
 * header read, on allocation failure or on a short payload read; otherwise
 * the result of the last read performed.
 */
int readData(long fd, message_data_t *data)
{
if(data == NULL || fd < 0) {errno = EINVAL; return -1;}
int test;
struct iovec iov;
iov.iov_base = &(data->hdr);
iov.iov_len = sizeof(message_data_hdr_t);
test = readv(fd, &iov, 1);
if(test <= 0){return -1;}
if(data->hdr.len != 0)
{
data->buf = malloc(data->hdr.len);
if(data->buf == NULL){return -1;}
/* Single read(): the bytes it did not return stay queued on the socket. */
test = read(fd, data->buf, data->hdr.len);
if((unsigned int)test != data->hdr.len)
return -1;
}
return test;
}
At this point the client has received the file, and it goes back to waiting for new messages:
/*
 * Read one complete message from fd: the header via readHeader(), then the
 * data part via readData().
 *
 * Returns the sum of the two results on success, or -1 when the arguments
 * are invalid (errno = EINVAL), the header read returns 0 or -1, or the
 * data read fails. The data result is checked before it is added so that a
 * failure there cannot be hidden inside a positive byte count.
 */
int readMsg(long fd, message_t *msg)
{
	int hdr_bytes;
	int data_bytes;

	if (msg == NULL || fd < 0) {
		errno = EINVAL;
		return -1;
	}

	hdr_bytes = readHeader(fd, &(msg->hdr));
	if (hdr_bytes == -1 || hdr_bytes == 0)
		return -1;

	data_bytes = readData(fd, &(msg->data));
	if (data_bytes < 0)
		return -1;

	return hdr_bytes + data_bytes;
}
This is the point where the client should simply wait, because there are no incoming messages; instead, in this case it reads something, and I do not know where it comes from.
When i try to print this unwanted message with GDB it prints:
{hdr = {op = 512,
sender = "\000\000\020G\032\324\t\000\000\n\000\000\000\000\030\021B\bC\n\000\000\v\000\000\000\000\021D\v\222\000"},
data = {hdr = {receiver = "\000\000\000\000\021E\022C\n\000\000\b\v\000\000\000\000\021F\020I\n\000\000\020\000\006\b\002\n\000\000\006",
len = 131072},
buf = 0x7ffff7f2f010 ""}`
Of course this is meaningless.
I hope that this description will be useful
Thank you all in advance.
Ok, I solved my issue.
As written in the comment, this problem was due to the lack of a check on partial writing.
Now the function readData() looks like this:
/*
 * Read exactly len bytes from fd into dst, retrying after short reads.
 * Returns 1 when all bytes arrived, 0 if read() reported end-of-file
 * first, and -1 if read() failed (errno as set by read()).
 */
static int read_exact(long fd, void *dst, size_t len)
{
	char *p = dst;
	size_t done = 0;

	while (done < len) {
		ssize_t got = read(fd, p + done, len - done);

		if (got == -1)
			return -1;
		if (got == 0)
			return 0;
		done += (size_t)got;
	}
	return 1;
}

/*
 * Receive the data part of a message: the fixed-size header, then, if
 * hdr.len is non-zero, a freshly malloc'ed payload of exactly hdr.len
 * bytes. Both parts are read in a loop, so short reads are completed
 * rather than leaving stray bytes queued on the socket.
 *
 * On success the caller owns data->buf and must free it. On failure the
 * buffer allocated here is freed and data->buf is set to NULL.
 *
 * Returns the payload length (0 when hdr.len is 0), or -1 on invalid
 * arguments (errno = EINVAL), read error, allocation failure, or
 * end-of-file in the middle of the payload (errno = ENOENT).
 */
int readData(long fd, message_data_t *data)
{
	int rc;

	if (data == NULL || fd < 0) {
		errno = EINVAL;
		return -1;
	}

	rc = read_exact(fd, &(data->hdr), sizeof(message_data_hdr_t));
	if (rc <= 0)
		return -1;

	if (data->hdr.len == 0)
		return 0;

	data->buf = malloc(data->hdr.len);
	if (data->buf == NULL)
		return -1;

	rc = read_exact(fd, data->buf, data->hdr.len);
	if (rc <= 0) {
		/* Keep the error code across free(), which may change errno. */
		int saved = (rc == 0) ? ENOENT : errno;

		free(data->buf);
		data->buf = NULL;
		errno = saved;
		return -1;
	}

	return (int)data->hdr.len;
}
and sendData():
/*
 * Write exactly len bytes from src to fd, retrying after short writes.
 * Returns 0 when everything was written, -1 if write() failed (errno as
 * set by write()) or accepted no bytes (errno = ENOENT).
 */
static int write_exact(long fd, const void *src, size_t len)
{
	const char *p = src;
	size_t done = 0;

	while (done < len) {
		ssize_t put = write(fd, p + done, len - done);

		if (put == -1)
			return -1;
		if (put == 0) {
			errno = ENOENT;
			return -1;
		}
		done += (size_t)put;
	}
	return 0;
}

/*
 * Send the data part of a message: the fixed-size header, then, if hdr.len
 * is non-zero, the hdr.len payload bytes at msg->buf. Both parts are
 * written in a loop so that a short write cannot truncate the stream.
 *
 * Returns the number of payload bytes sent, or the header size when there
 * is no payload; -1 on invalid arguments (errno = EINVAL) or write
 * failure. The value is the total transferred, not the size of the last
 * chunk written.
 */
int sendData(long fd, message_data_t *msg)
{
	if (msg == NULL || fd < 0) {
		errno = EINVAL;
		return -1;
	}

	if (write_exact(fd, &(msg->hdr), sizeof(message_data_hdr_t)) == -1)
		return -1;

	if (msg->hdr.len == 0)
		return (int)sizeof(message_data_hdr_t);

	if (write_exact(fd, msg->buf, msg->hdr.len) == -1)
		return -1;

	return (int)msg->hdr.len;
}
In this way I no longer found the error.
Thanks for the help!

DPDK create a packet for transmission

I am new to DPDK and am trying to create a packet and send it from one DPDK-enabled machine to another that is connected directly via Ethernet. I modified the example examples/rxtx_callbacks/main.c provided with DPDK on both sides. However, I am not receiving anything at the receiver. What am I doing wrong?
Modified function at transmitter: lcore_main is modified:
/*
 * Transmit loop: forever, for every port, allocate BURST_SIZE mbufs, write
 * an Ethernet header at the start of each one and hand the burst to TX
 * queue 0 of that port; mbufs the driver did not accept are freed.
 *
 * The function never returns (the outer loop has no exit).
 *
 * addr (the source MAC) and mbuf_pool are not declared in this function;
 * they are taken from the enclosing scope. ipv4_hdr and ret are declared
 * but not used below.
 */
static __attribute__((noreturn)) void lcore_main()
{
uint16_t port;
struct ether_hdr *eth_hdr;
struct ether_addr daddr;
/* Destination MAC, one byte per element (74:e1:e4:cc:6a:52 in hex). */
daddr.addr_bytes[0] = 116;
daddr.addr_bytes[1] = 225;
daddr.addr_bytes[2] = 228;
daddr.addr_bytes[3] = 204;
daddr.addr_bytes[4] = 106;
daddr.addr_bytes[5] = 82;
//rte_eth_macaddr_get(portid, &addr);
struct ipv4_hdr *ipv4_hdr;
int32_t i;
int ret;
/* Warn for each port whose socket id is > 0 and differs from this core's. */
RTE_ETH_FOREACH_DEV(port)
if (rte_eth_dev_socket_id(port) > 0 &&
rte_eth_dev_socket_id(port) !=
(int)rte_socket_id())
printf("WARNING, port %u is on remote NUMA node to "
"polling thread.\n\tPerformance will "
"not be optimal.\n", port);
printf("\nCore %u forwarding packets. [Ctrl+C to quit]\n",
rte_lcore_id());
//struct rte_mbuf *m_head = rte_pktmbuf_alloc(mbuf_pool);
struct rte_mbuf *m_head[BURST_SIZE];
for (;;) {
RTE_ETH_FOREACH_DEV(port) {
/* A failed bulk allocation is only logged; the code below still runs. */
if(rte_pktmbuf_alloc_bulk(mbuf_pool, m_head, BURST_SIZE)!=0)
{
printf("Allocation problem\n");
}
for(i = 0; i < BURST_SIZE; i++) {
/*
 * Only the three Ethernet header fields are written. Nothing in this
 * loop sets the mbuf's data/packet length, and no IPv4 header follows
 * the IPv4 ether type.
 */
eth_hdr = rte_pktmbuf_mtod(m_head[i], struct ether_hdr *);
//eth_hdr = (struct ether_hdr *)rte_pktmbuf_append(m_head[i],
// sizeof(struct ether_hdr));
eth_hdr->ether_type = htons(ETHER_TYPE_IPv4);
rte_memcpy(&(eth_hdr->s_addr), &addr, sizeof(struct ether_addr));
rte_memcpy(&(eth_hdr->d_addr), &daddr, sizeof(struct ether_addr));
}
const uint16_t nb_tx = rte_eth_tx_burst(port, 0, m_head, BURST_SIZE);
/* Free the tail of the burst that the TX call did not take. */
if (unlikely(nb_tx < BURST_SIZE)) {
uint16_t buf;
for (buf = nb_tx; buf < BURST_SIZE; buf++)
rte_pktmbuf_free(m_head[buf]);
}
}
}
}
receiver side RTE_ETH_FOREACH_DEV of tx part is modified to:
/*
 * Receive side: for every port, poll RX queue 0 for up to BURST_SIZE
 * packets, print the ether type field of each one, then free them all
 * (forwarding is disabled, so nb_tx is fixed at 0).
 * i and eth_hdr are declared by the enclosing function.
 */
RTE_ETH_FOREACH_DEV(port) {
	struct rte_mbuf *bufs[BURST_SIZE];
	/* rte_eth_rx_burst() takes the RX queue index as its second argument. */
	const uint16_t nb_rx = rte_eth_rx_burst(port, 0, bufs, BURST_SIZE);

	//printf("Number of Packets received %d\n", nb_rx);
	for (i = 0; i < nb_rx; i++) {
		//ipv4_hdr = rte_pktmbuf_mtod_offset(bufs[i], struct ipv4_hdr *,
		//		sizeof(struct ether_hdr));
		//printf("Packet ip received %d\n", ipv4_hdr->src_addr);
		eth_hdr = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
		/* The field is printed as stored, without byte-order conversion. */
		printf("Packet ip received %d\n", eth_hdr->ether_type);
	}

	if (unlikely(nb_rx == 0))
		continue;

	const uint16_t nb_tx = 0; // = rte_eth_tx_burst(port ^ 1, 0, bufs, nb_rx);
	if (unlikely(nb_tx < nb_rx)) {
		uint16_t buf;

		for (buf = nb_tx; buf < nb_rx; buf++)
			rte_pktmbuf_free(bufs[buf]);
	}
}
Please let me know if I missed something.
There are few issues with the code:
eth_hdr = rte_pktmbuf_mtod(m_head[i], struct ether_hdr *);
Unlike rte_pktmbuf_append(), the rte_pktmbuf_mtod() does not change the packet length, so it should be set manually before the tx.
eth_hdr->ether_type = htons(ETHER_TYPE_IPv4);
If we set ETHER_TYPE_IPv4, a correct IPv4 header must follow. So we need either to add the header or to change the ether_type.
rte_memcpy(&(eth_hdr->s_addr), &addr, sizeof(struct ether_addr));
Where does the source address come from?
const uint16_t nb_tx = rte_eth_tx_burst(port, 0, m_head, BURST_SIZE);
Looks like we transmit a burst of zero-sized packets with invalid IPv4 headers. Please also make sure the source/destination addresses are correct.
As suggested by #andriy-berestovsky, I used rte_eth_stats_get() and it shows packets are present in ethernet ring via the field ipackets but rte_eth_rx_burst is not returning any packets. Full code is included here, please let me know what I am doing wrong. (I am using testpmd at transmitter side)
#include <stdint.h>
#include <inttypes.h>
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_cycles.h>
#include <rte_lcore.h>
#include <rte_ip.h>
#include <rte_mbuf.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <signal.h>
/* Not referenced anywhere in this listing. */
#define MAX_SOURCE_SIZE (0x100000)
/* Requested RX/TX descriptor ring sizes passed to queue setup. */
#define RX_RING_SIZE 1024
#define TX_RING_SIZE 1024
/* mbufs per port and per-core cache size used when creating mbuf_pool. */
#define NUM_MBUFS 8191
#define MBUF_CACHE_SIZE 250
/* Maximum number of packets requested per RX burst. */
#define BURST_SIZE 32
/* Baseline port configuration; only the maximum RX packet length is set. */
static const struct rte_eth_conf port_conf_default = {
.rxmode = {
.max_rx_pkt_len = ETHER_MAX_LEN,
},
};
/* Not referenced anywhere in this listing. */
static struct {
uint64_t total_cycles;
uint64_t total_pkts;
} latency_numbers;
/* Set by signal_handler(); polled by the receive loop to stop. */
static volatile bool force_quit;
/* Packet buffer pool, created in main() and used for RX queue setup. */
struct rte_mempool *mbuf_pool;
/*
 * SIGINT/SIGTERM handler: print per-port receive statistics and request
 * shutdown by setting force_quit. Other signals are ignored.
 *
 * The counters are uint64_t, so they are printed with PRIu64 rather than
 * %llu, which does not match that type on every platform.
 *
 * NOTE(review): printf() and rte_eth_stats_get() are called from signal
 * context; confirm this is acceptable for this test program.
 */
static void
signal_handler(int signum)
{
	struct rte_eth_stats eth_stats;
	uint16_t i;

	if (signum == SIGINT || signum == SIGTERM) {
		printf("\n\nSignal %d received, preparing to exit...\n",
				signum);
		RTE_ETH_FOREACH_DEV(i) {
			rte_eth_stats_get(i, &eth_stats);
			printf("Total number of packets received %" PRIu64
			       ", dropped rx full %" PRIu64
			       " and rest= %" PRIu64 ", %" PRIu64 ", %" PRIu64 "\n",
			       eth_stats.ipackets, eth_stats.imissed,
			       eth_stats.ierrors, eth_stats.rx_nombuf,
			       eth_stats.q_ipackets[0]);
		}
		force_quit = true;
	}
}
/* MAC address storage; filled by rte_eth_macaddr_get() in port_init(). */
struct ether_addr addr;
/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter.
 *
 * Sequence: validate the port id, read device info, configure one RX and
 * one TX queue, adjust the descriptor counts, set up the queues, start the
 * device, read and print its MAC address, and enable promiscuous mode.
 *
 * Returns 0 on success, -1 for an invalid port, or the failing call's
 * return value otherwise.
 */
static inline int
port_init(uint16_t port, struct rte_mempool *mbuf_pool)
{
struct rte_eth_conf port_conf = port_conf_default;
const uint16_t rx_rings = 1, tx_rings = 1;
uint16_t nb_rxd = RX_RING_SIZE;
uint16_t nb_txd = TX_RING_SIZE;
int retval;
uint16_t q;
struct rte_eth_dev_info dev_info;
struct rte_eth_txconf txconf;
if (!rte_eth_dev_is_valid_port(port))
return -1;
rte_eth_dev_info_get(port, &dev_info);
/* Turn on the fast-free TX offload only when the device advertises it. */
if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
port_conf.txmode.offloads |=
DEV_TX_OFFLOAD_MBUF_FAST_FREE;
retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
if (retval != 0)
return retval;
/* nb_rxd / nb_txd may be modified by this call; the results are used below. */
retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &nb_rxd, &nb_txd);
if (retval != 0) {
printf("Error in adjustment\n");
return retval;
}
/* RX queues draw their buffers from mbuf_pool. */
for (q = 0; q < rx_rings; q++) {
retval = rte_eth_rx_queue_setup(port, q, nb_rxd,
rte_eth_dev_socket_id(port), NULL, mbuf_pool);
if (retval < 0) {
printf("RX queue setup prob\n");
return retval;
}
}
/* Start from the driver's default TX config and apply the chosen offloads. */
txconf = dev_info.default_txconf;
txconf.offloads = port_conf.txmode.offloads;
for (q = 0; q < tx_rings; q++) {
retval = rte_eth_tx_queue_setup(port, q, nb_txd,
rte_eth_dev_socket_id(port), &txconf);
if (retval < 0)
return retval;
}
retval = rte_eth_dev_start(port);
if (retval < 0) {
printf("Error in start\n");
return retval;
}
/* Stores the MAC in the file-scope addr, overwriting any earlier port's. */
rte_eth_macaddr_get(port, &addr);
printf("Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
(unsigned)port,
addr.addr_bytes[0], addr.addr_bytes[1],
addr.addr_bytes[2], addr.addr_bytes[3],
addr.addr_bytes[4], addr.addr_bytes[5]);
rte_eth_promiscuous_enable(port);
return 0;
}
/*
 * Receive loop run on the calling lcore: poll RX queue 0 of every port,
 * print the 32-bit value found at the IPv4 source-address offset of each
 * received packet, and free the packet. Nothing is forwarded.
 *
 * The loop ends, and the function returns, once force_quit has been set;
 * the function is therefore not declared noreturn.
 *
 * NOTE(review): the address is read without checking the ether type or the
 * packet length, and is printed as stored (no byte-order conversion).
 */
static void
lcore_main(void)
{
	uint16_t port;
	struct ipv4_hdr *ipv4_hdr;
	int32_t i;

	/* Warn for each port whose socket id is > 0 and differs from ours. */
	RTE_ETH_FOREACH_DEV(port)
	{
		if (rte_eth_dev_socket_id(port) > 0 &&
				rte_eth_dev_socket_id(port) !=
						(int)rte_socket_id())
			printf("WARNING, port %u is on remote NUMA node to "
					"polling thread.\n\tPerformance will "
					"not be optimal.\n", port);
	}

	printf("\nCore %u forwarding packets. [Ctrl+C to quit]\n",
			rte_lcore_id());

	for (;;) {
		RTE_ETH_FOREACH_DEV(port) {
			struct rte_mbuf *bufs[BURST_SIZE];
			const uint16_t nb_rx = rte_eth_rx_burst(port, 0, bufs, BURST_SIZE);

			for (i = 0; i < nb_rx; i++) {
				ipv4_hdr = rte_pktmbuf_mtod_offset(bufs[i], struct ipv4_hdr *,
						sizeof(struct ether_hdr));
				/* src_addr is a 32-bit unsigned field. */
				printf("Packet ip received %" PRIu32 "\n", ipv4_hdr->src_addr);
				/* Every received mbuf is released here. */
				rte_pktmbuf_free(bufs[i]);
			}
		}

		if (force_quit)
			break;
	}
}
/* Main function, does initialisation and calls the per-lcore functions */
int
main(int argc, char *argv[])
{
uint16_t nb_ports;
uint16_t portid, port;
/* init EAL */
int ret = rte_eal_init(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
argc -= ret;
argv += ret;
force_quit = false;
signal(SIGINT, signal_handler);
signal(SIGTERM, signal_handler);
nb_ports = rte_eth_dev_count_avail();
printf("size ordered %lld\n", NUM_MBUFS *nb_ports);
mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
NUM_MBUFS * nb_ports, MBUF_CACHE_SIZE, 0,
RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
if (nb_ports < 1)
rte_exit(EXIT_FAILURE, "Error: number of ports must be greater than %d\n", nb_ports);
if (mbuf_pool == NULL)
rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
// initialize all ports
RTE_ETH_FOREACH_DEV(portid)
if (port_init(portid, mbuf_pool) != 0)
rte_exit(EXIT_FAILURE, "Cannot init port %"PRIu8"\n",
portid);
if (rte_lcore_count() > 1)
printf("\nWARNING: Too much enabled lcores - "
"App uses only 1 lcore\n");
// call lcore_main on master core only
lcore_main();
return 0;
}
It seems to be a problem of ethernet card with ubuntu 14.04. With ubuntu 16.04 it is working fine.

Resources