Netfilter hook stateful connection packet filtering - c

I am writing a Netfilter hook and want to do a stateful analysis of incoming TCP packets, whether they belong to an existing connection or a new connection is starting.
This is my first try at writing code using Netfilter and after reading https://people.netfilter.org/pablo/docs/login.pdf I understand I need to check if a packet is categorized as a NEW or ESTABLISHED state. But I cannot find any documentation of how to write code for this.
static unsigned int hfunc(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) {
struct iphdr *iph;
struct udphdr *udph;
if (!skb)
return NF_ACCEPT;
iph = ip_hdr(skb);
if (iph->protocol == IPPROTO_TCP) {
/*
if packet SYN flag is enabled and state==NEW:
return NF_ACCEPT
else if SYN flag is disabled and state==NEW:
return NF_DROP
*/
}
return NF_ACCEPT
}
static int __init my_net_module_init(void) {
printk(KERN_INFO "Initializing my netfilter module\n");
// Allocating memory for hook structure.
my_nf_hook = (struct nf_hook_ops*) kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL);
// Constructing the structure
my_nf_hook->hook = (nf_hookfn*)hfunc; /* hook function */
my_nf_hook->hooknum = NF_INET_PRE_ROUTING; /* received packets */
my_nf_hook->pf = PF_INET; /* IPv4 */
my_nf_hook->priority = NF_IP_PRI_FIRST; /* max hook priority */
nf_register_net_hook(&init_net, my_nf_hook);
return 0;
}
static void __exit my_net_module_exit(void) {
nf_unregister_net_hook(&init_net, my_nf_hook);
kfree(my_nf_hook);
printk(KERN_INFO "Exiting my netfilter module\n");
}
module_init(my_net_module_init);
module_exit(my_net_module_exit);
Edit:
Added code snippet for registering hook in pre-routing.

Seems that in your hook you want to make a decision on packet based on conntrack(CT) info about the connection state - to block (drop) all the TCP packets which are in the middle of connection, i.e. packets both without SYN flag and without connection entry in CT.
So if you want to reap the benefits of CT, you have to let him work a bit.
Now your hook is in NF_INET_PRE_ROUTING with NF_IP_PRI_FIRST priority. Just look at the picture of Linux kernel packet flow. If we talk about pre-routing chain CT-handling is somewhere after RAW table (i.e. with a lower priority).
The list of priorities you can see here:
enum nf_ip_hook_priorities {
NF_IP_PRI_FIRST = INT_MIN,
NF_IP_PRI_CONNTRACK_DEFRAG = -400,
NF_IP_PRI_RAW = -300,
NF_IP_PRI_SELINUX_FIRST = -225,
NF_IP_PRI_CONNTRACK = -200,
NF_IP_PRI_MANGLE = -150,
NF_IP_PRI_NAT_DST = -100,
NF_IP_PRI_FILTER = 0,
NF_IP_PRI_SECURITY = 50,
NF_IP_PRI_NAT_SRC = 100,
NF_IP_PRI_SELINUX_LAST = 225,
NF_IP_PRI_CONNTRACK_HELPER = 300,
NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
NF_IP_PRI_LAST = INT_MAX,
};
Thus to stick in after CT (after nf_conntrack_in()) you must register your hook with priority lower than NF_IP_PRI_CONNTRACK (i.e. with greater number, e.g. -50).
So you do:
static struct nf_hook_ops hooks[] __read_mostly = {
{
.hook = hfunc,
.pf = PF_INET,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK + 150
},
// ...
};
// ...
int ret;
ret = nf_register_hooks(hooks, ARRAY_SIZE(hooks));
if (ret < 0)
// error
Then you should access the CT info from within your hook:
static unsigned int hfunc(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state) {
struct iphdr *iph;
iph = ip_hdr(skb);
if (iph->protocol == IPPROTO_TCP) {
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
struct tcphdr *tcph;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return NF_ACCEPT;
tcph = tcp_hdr(skb)
if (tcph->syn) { // && !tcph->ack ???
if (ctinfo == IP_CT_NEW)
return NF_ACCEPT;
} else {
if (ctinfo == IP_CT_NEW)
return NF_DROP;
}
}
return NF_ACCEPT
}
Also remember that CT must be involved in your Linux kernel network processing. There should be CT modules inserted into kernel and an appropriate iptables rule added.

Related

network device No buffer Space

Hello i created Network driver that uses the uart port to send and recive.
My driver works with some issues. I was able to ping but always afther a few pings i get
ping: sendmsg: No buffer space available driver
I checked the kernel logs but i could not see anything.
this is how i recive data:
struct stm32_port *stm32_port = netdev_priv(my_net);
struct sk_buff *skb;
unsigned char *dma_start;
dma_start = stm32_port->rx_buf + (RX_BUF_L - stm32_port->last_res);
print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_OFFSET, 16, 1, dma_start, 16, true);
skb = dev_alloc_skb(dma_size + 2);
if (!skb) {
if (printk_ratelimit( ))
printk(KERN_NOTICE "snull rx: low on mem - packet dropped\n");
my_net->stats.rx_dropped++;
//goto error;
}
memcpy(skb_put(skb, dma_size), dma_start, dma_size);
/* Write metadata, and then pass to the receive level */
skb->dev = my_net;
skb->protocol = eth_type_trans(skb, my_net);
skb->ip_summed = CHECKSUM_NONE; // let the OS check the checksum
my_net->stats.rx_packets++;
my_net->stats.rx_bytes += dma_size;
netif_rx(skb);
port->icount.rx += dma_size;
stm32_port->last_res -= dma_size;
if (stm32_port->last_res == 0)
stm32_port->last_res = RX_BUF_L; //dma_count
Here is how i send my data:
struct stm32_port *lp = netdev_priv(ndev);
struct uart_port *port = &lp->port;
struct sk_buff *sk_buff;
struct dma_async_tx_descriptor *desc = NULL;
struct stm32_usart_offsets *ofs = &lp->info->ofs;
unsigned pktlen = skb->len;
dma_cookie_t cookie;
int ret = 0;
//print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, 16, true);
netif_stop_queue(ndev);
sk_buff = skb_get(skb);
if (ofs->icr == UNDEF_REG){
stm32_usart_clr_bits(port, ofs->isr, USART_SR_TC);
}else{
writel_relaxed(USART_ICR_TCCF, port->membase + ofs->icr);
}
memcpy(&lp->tx_buf[0], sk_buff->data, pktlen);
desc = dmaengine_prep_slave_single(lp->tx_ch,
lp->tx_dma_buf,
pktlen,
DMA_MEM_TO_DEV,
DMA_PREP_INTERRUPT);
if (!desc){
goto fallback_err;
}
cookie = dmaengine_submit(desc);
ret = dma_submit_error(cookie);
if (ret) {
/* dma no yet started, safe to free resources */
dmaengine_terminate_async(lp->tx_ch);
goto fallback_err;
}
/* Issue pending DMA TX requests */
dma_async_issue_pending(lp->tx_ch);
stm32_usart_set_bits(port, ofs->cr3, USART_CR3_DMAT);
/* rely on TXE irq (mask or unmask) for sending remaining data */
stm32_usart_tx_interrupt_disable(port);
ndev->stats.tx_packets++;
ndev->stats.tx_bytes += pktlen;
fallback_err:
skb_tx_timestamp(skb);
dev_kfree_skb (skb);
netif_start_queue(ndev);
return NETDEV_TX_OK;
Thanks to #stark i found a solution i free the buffer now with the __kfree_skb() methode because like that the refcount will not be checked.

AF_XDP-Socket vs Linux Sockets: Why does my AF-XDP Socket lose packets whereas a generic linux socket doesn't?

I am comparing AF-XDP sockets vs Linux Sockets in terms of how many packets they can process without packet-loss (packet-loss is defined as the RTP-sequence number of the current packet is not equal to the RTP-sequence number of the previous packet + 1).
I noticed that my AF-XDP socket program (I can't determine if this problem is related to the kernel program or the user-space program) is losing around ~25 packets per second at around 390.000 packets per second whereas an equivalent program with generic linux sockets doesn't lose any packets.
I implemented a so-called distributor-program which loads the XDP-kernel program once, sets up a generic linux socket and adds setsockopt(IP_ADD_MEMBERSHIP) to this generic socket for every multicast-address I pass to the program via command line.
After this, the distributor loads the filedescriptor of a BPF_MAP_TYPE_HASH placed in the XDP-kernel program and inserts routes for the traffic in case a single AF-XDP socket needs to share its umem later on.
The XDP-kernel program then checks for each IPv4/UDP packet if there is an entry in that hash-map. This basically looks like this:
const struct pckt_idntfy_raw raw = {
.src_ip = 0, /* not used at the moment */
.dst_ip = iph->daddr,
.dst_port = udh->dest,
.pad = 0
};
const int *idx = bpf_map_lookup_elem(&xdp_packet_mapping, &raw);
if(idx != NULL) {
if (bpf_map_lookup_elem(&xsks_map, idx)) {
bpf_printk("Found socket # index: %d!\n", *idx);
return bpf_redirect_map(&xsks_map, *idx, 0);
} else {
bpf_printk("Didn't find connected socket for index %d!\n", *idx);
}
}
In case idx exists this means that there is a socket sitting behind that index in the BPF_MAP_TYPE_XSKMAP.
After doing all that the distributor spawns a new process via fork() passing all multicast-addresses (including destination port) which should be processed by that process (one process handles one RX-Queue). In case there are not enough RX-Queues, some processes may receive multiple multicast-addresses. This then means that they are going to use SHARED UMEM.
I basically oriented my AF-XDP user-space program on this example code: https://github.com/torvalds/linux/blob/master/samples/bpf/xdpsock_user.c
I am using the same xsk_configure_umem, xsk_populate_fill_ring and xsk_configure_socket functions.
Because I figured I don't need maximum latency for this application, I send the process to sleep for a specified time (around 1 - 2ms) after which it loops through every AF-XDP socket (most of the time it is only one socket) and processes every received packet for that socket, verifying that no packets have been missed:
while(!global_exit) {
nanosleep(&spec, &remaining);
for(int i = 0; i < cfg.ip_addrs_len; i++) {
struct xsk_socket_info *socket = xsk_sockets[i];
if(atomic_exchange(&socket->stats_sync.lock, 1) == 0) {
handle_receive_packets(socket);
atomic_fetch_xor(&socket->stats_sync.lock, 1); /* release socket-lock */
}
}
}
In my opinion there is nothing too fancy about this but somehow I lose ~25 packets at around 390.000 packets even though my UMEM is close to 1GB of RAM.
In comparison, my generic linux socket program looks like this (in short):
int fd = socket(AF_INET, SOCK_RAW, IPPROTO_UDP);
/* setting some socket options */
struct sockaddr_in sin;
memset(&sin, 0, sizeof(struct sockaddr_in));
sin.sin_family = AF_INET;
sin.sin_port = cfg->ip_addrs[0]->pckt.dst_port;
inet_aton(cfg->ip_addrs[0]->pckt.dst_ip, &sin.sin_addr);
if(bind(fd, (struct sockaddr*)&sin, sizeof(struct sockaddr)) < 0) {
fprintf(stderr, "Error on binding socket: %s\n", strerror(errno));
return - 1;
}
ioctl(fd, SIOCGIFADDR, &intf);
The distributor-program creates a new process for every given multicast-ip in case generic linux sockets are used (because there are no sophisticated methods such as SHARED-UMEM in generic sockets I don't bother with multiple multicast-streams per process).
Later on I of course join the multicast membership:
struct ip_mreqn mreq;
memset(&mreq, 0, sizeof(struct ip_mreqn));
const char *multicast_ip = cfg->ip_addrs[0]->pckt.dst_ip;
if(inet_pton(AF_INET, multicast_ip, &mreq.imr_multiaddr.s_addr)) {
/* Local interface address */
memcpy(&mreq.imr_address, &cfg->ifaddr, sizeof(struct in_addr));
mreq.imr_ifindex = cfg->ifindex;
if(setsockopt(igmp_socket_fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(struct ip_mreqn)) < 0) {
fprintf(stderr, "Failed to set `IP_ADD_MEMBERSHIP`: %s\n", strerror(errno));
return;
} else {
printf("Successfully added Membership for IP: %s\n", multicast_ip);
}
}
and start processing packets (not sleeping but in a busy-loop like fashion):
void read_packets_recvmsg_with_latency(struct config *cfg, struct statistic *st, void *buff, const int igmp_socket_fd) {
char ctrl[CMSG_SPACE(sizeof(struct timeval))];
struct msghdr msg;
struct iovec iov;
msg.msg_control = (char*)ctrl;
msg.msg_controllen = sizeof(ctrl);
msg.msg_name = &cfg->ifaddr;
msg.msg_namelen = sizeof(cfg->ifaddr);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
iov.iov_base = buff;
iov.iov_len = BUFFER_SIZE;
struct timeval time_user, time_kernel;
struct cmsghdr *cmsg = (struct cmsghdr*)&ctrl;
const int64_t read_bytes = recvmsg(igmp_socket_fd, &msg, 0);
if(read_bytes == -1) {
return;
}
gettimeofday(&time_user, NULL);
if(cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_TIMESTAMP) {
memcpy(&time_kernel, CMSG_DATA(cmsg), sizeof(struct timeval));
}
if(verify_rtp(cfg, st, read_bytes, buff)) {
const double timediff = (time_user.tv_sec - time_kernel.tv_sec) * 1000000 + (time_user.tv_usec - time_kernel.tv_usec);
if(timediff > st->stats.latency_us) {
st->stats.latency_us = timediff;
}
}
}
int main(...) {
....
while(!is_global_exit) {
read_packets_recvmsg_with_latency(&cfg, &st, buffer, igmp_socket_fd);
}
}
That's pretty much it.
Please not that in the described use case where I start to lose packets I don't use SHARED UMEM, it's just a single RX-Queue receiving a multicast-stream. In case I process a smaller multicast-stream of around 150.000 pps - the AF-XDP solution doesn't lose any packets. But it is also the other way around - for around 520.000 pps on the same RX-Queue (using SHARED UMEM) I get a loss of 12.000 pps.
Any ideas what I am missing?

Obtaining real device of VLAN-interface through the netlink

I need to obtain the real_dev (f.e. ID) of the given VLAN-inteface.
I wrote some test snippet using libnl:
int main(void) {
struct nl_sock *sock;
struct nl_cache *cache;
char iface[] = "eno1.10";
//char iface[] = "eno1";
if (!(sock = nl_socket_alloc())) {
perror("nl_socket_alloc");
return -1;
}
if (nl_connect(sock, NETLINK_ROUTE) < 0) {
perror("nl_connect");
nl_socket_free( sock );
return -1;
}
if (rtnl_link_alloc_cache(sock, AF_UNSPEC, &cache) < 0) {
perror("rtnl_link_alloc_cache");
nl_socket_free( sock );
nl_close( sock );
return -1;
}
{
int ifindex;
struct rtnl_link *link = NULL;
if (!(ifindex = rtnl_link_name2i(cache, iface))) {
perror("rtnl_link_name2i");
return -1;
}
printf("ind: %d\n", ifindex);
if (!(link = rtnl_link_get(cache, ifindex))) {
perror("rtnl_link_get");
return -1;
}
if (rtnl_link_is_vlan(link)) {
puts("It's VLAN link");
/* alas it's not about the 'real' device */
printf("master: %d\n", rtnl_link_get_master(link));
} else
puts("It's 'real' link");
}
return 0;
}
So I have some interface ID and I can check if it's a VLAN-interface, but I have no idea how to obtain the interface the vlan is attached to.
It seems that libnl's API does not provide such possibility.
Is there a way to obtain the VLAN's "parent" interface ID through the libnl or the native netlink API?
It's all about IFLA_LINK:
/* IFLA_LINK.
For usual devices it is equal ifi_index.
If it is a "virtual interface" (f.e. tunnel), ifi_link
can point to real physical interface (f.e. for bandwidth calculations),
or maybe 0, what means, that real media is unknown (usual
for IPIP tunnels, when route to endpoint is allowed to change)
*/
Thus through the native netlink API it could be done such a way:
/* some preparation code */
struct rtattr *rta = IFLA_RTA(msg);
int len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*msg));
for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len))
if (rta->rta_type == IFLA_LINK) {
printf("Real device ID:%u\n",
*(unsigned short *)((char *) rta + NLA_HDRLEN));
break;
}
Full example on github.

kernel TCP/IP implementation - about accept queue length

I'm recently learning the TCP/IP implementation in linux kernel(version 4.4), and got really confused about the accept queue. I know there is a queue in struct inet_connection_sock which is called a accept queue:
struct inet_connection_sock {
...
/* #icsk_accept_queue: FIFO of established children */
struct request_sock_queue icsk_accept_queue;
...
}
and there is a qlen member in it, I think it is used to indicate the length of the queue.
struct request_sock_queue {
...
/* length of the queue? */
atomic_t qlen;
...
};
Here is what I think I know: when a LISTEN socket receives a SYN packet, in function tcp_conn_request, inet_csk_reqsk_queue_hash_add is called to put the newly created NEW_SYN_RECV sock into ehash table (not icsk_accept_queue)
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout)
{
reqsk_queue_hash_req(req, timeout); // add to ehash table
inet_csk_reqsk_queue_added(sk); // increment icsk_accept_queue.qlen
}
But in this funcfion, inet_csk_reqsk_queue_added is called to increment icsk_accept_queue.qlen. My question is why increment qlen since nothing is inserted to icsk_accept_queue? Is it not the length of icsk_accept_queue?
Also, tcp_conn_request called inet_csk_reqsk_queue_add to add a fast open sock(if enabled) to icsk_accept_queue:
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
struct request_sock *req,
struct sock *child)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
spin_lock(&queue->rskq_lock);
if (unlikely(sk->sk_state != TCP_LISTEN)) {
inet_child_forget(sk, req, child);
child = NULL;
} else {
req->sk = child;
req->dl_next = NULL;
if (queue->rskq_accept_head == NULL)
queue->rskq_accept_head = req;
else
queue->rskq_accept_tail->dl_next = req;
queue->rskq_accept_tail = req;
sk_acceptq_added(sk); // increment sk.sk_ack_backlog
}
spin_unlock(&queue->rskq_lock);
return child;
}
But this function ended up incrementing sk.sk_ack_backlog (by calling sk_acceptq_added) instead of icsk_accept_queue.qlen. Why not?

C Programming: Check if the IP address is added on any given NIC

Problem description:
I have a IP address (can be either IPv4/IPv6) and NIC address, how can I check if the IP address is added to the given NIC(or any NIC) using C.
I know it is simple to do the same on command line/using scripts, however I need to check the same in C Program.
Example:
IP - 192.168.0.1
NIC - eth0
Using command line(linux platform) the below command would tell me if the IP is added or not:
ip addr show | grep "192.168.0.1"
p.s.: Is there any library function which can be used to get similar outputs?
You want to use getifaddrs, which returns a list of network interfaces and the addresses associated with them.
From the man page:
int getifaddrs(struct ifaddrs **ifap);
The getifaddrs() function creates a linked list of structures
describing the network interfaces of the local system, and stores
the address of the first item of the list in *ifap. The list
consists of ifaddrs structures, defined as follows:
struct ifaddrs {
struct ifaddrs *ifa_next; /* Next item in list */
char *ifa_name; /* Name of interface */
unsigned int ifa_flags; /* Flags from SIOCGIFFLAGS */
struct sockaddr *ifa_addr; /* Address of interface */
struct sockaddr *ifa_netmask; /* Netmask of interface */
union {
struct sockaddr *ifu_broadaddr;
/* Broadcast address of interface */
struct sockaddr *ifu_dstaddr;
/* Point-to-point destination address */
} ifa_ifu;
#define ifa_broadaddr ifa_ifu.ifu_broadaddr
#define ifa_dstaddr ifa_ifu.ifu_dstaddr
void *ifa_data; /* Address-specific data */
};
Here's an example of how I've used it in one of my programs:
union sockaddr_u {
struct sockaddr_storage ss;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
};
struct iflist {
char name[IFNAME_LEN];
union sockaddr_u su;
int isloopback;
int ismulti;
int ifidx;
};
void getiflist(struct iflist *list, int *len)
{
struct ifaddrs *ifa, *ifa_tmp;
int count;
unsigned ifidx;
if (getifaddrs(&ifa) == -1) {
perror("getifaddrs failed");
*len = 0;
return;
}
ifa_tmp = ifa;
count = *len;
*len = 0;
while (ifa_tmp && (*len < count)) {
if ((ifidx = if_nametoindex(ifa_tmp->ifa_name)) == 0) {
perror("Error getting interface index for interface %s",
ifa_tmp->ifa_name);
continue;
}
if (ifa_tmp->ifa_addr && ((ifa_tmp->ifa_addr->sa_family == AF_INET) ||
(ifa_tmp->ifa_addr->sa_family == AF_INET6)) &&
((ifa_tmp->ifa_flags & IFF_UP) != 0)) {
memset(&list[*len], 0, sizeof(struct iflist));
strncpy(list[*len].name, ifa_tmp->ifa_name,
sizeof(list[*len].name) - 1);
memcpy(&list[*len].su, ifa_tmp->ifa_addr,
sizeof(struct sockaddr_storage));
list[*len].isloopback = (ifa_tmp->ifa_flags & IFF_LOOPBACK) != 0;
list[*len].ismulti = (ifa_tmp->ifa_flags & IFF_MULTICAST) != 0;
list[*len].ifidx = ifidx;
(*len)++;
}
ifa_tmp = ifa_tmp->ifa_next;
}
freeifaddrs(ifa);
}
You may want to use GETIFADDRS http://man7.org/linux/man-pages/man3/getifaddrs.3.html
This should Work:
#define _BSD_SOURCE
#include <ifaddrs.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<netdb.h>
int main(void){
struct ifaddrs *ip, *hosst;
int s;
char host[NI_MAXHOST];
if (getifaddrs(&ip) == -1){
perror("getifaddrs");
return 1;
}
for (hosst = ip; hosst != NULL; hosst = hosst->ifa_next){
if (hosst->ifa_addr == NULL){
continue;
}
s=getnameinfo(hosst->ifa_addr,sizeof(struct sockaddr_in),host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST);
if((strcmp(hosst->ifa_name,"wlan0")==0)&&(hosst->ifa_addr->sa_family==AF_INET)){
if (s != 0){
printf("getnameinfo() failed: %s\n", gai_strerror(s));
return 1;
}
printf("IP - %s\n", host);
printf("NIC - %s\n",hosst->ifa_name );
}
}
free(ip);
return 0;
}
Output:
IP - 192.168.0.110
NIC - wlan0
The method(GETIFADDRS) suggested above is correct solution for the stated problem description. The getifaddrs browses through all the IP's on the NIC.
However in my case there are many IP's addresses added on the NIC/system and I am OK with knowing if the IP is present in any of the NIC. I found a easier/faster solution.
To check if the IP is added on any of the NIC card of the machine, just open a TCP socket and bind with port=zero(0) and the IP address to be checked. The successful bind will suggest that the IP is available/present.
Note: This works if IP is added on any of the NIC card in the system. The port zero should be used instead of hard-coding as it selects any available port on in the system
(ref. http://compnetworking.about.com/od/tcpip/p/port-numbers-0.htm)
This method is tested for both IPv4/IPv6 in UNIX environment(rhel)
PS:Do not forget to close the socket after verifying the presence of IP

Resources