I have the following code for my educational socket server in C.
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
double get_wall_time()
{
struct timeval time;
if (gettimeofday(&time, NULL)){
return 0;
}
return (double)time.tv_sec + (double)time.tv_usec * 0.000001;
}
double get_cpu_time()
{
return (double)clock() / CLOCKS_PER_SEC;
}
int main()
{
double wall = get_wall_time();
double cpu = get_cpu_time();
int sfd = socket(AF_INET, SOCK_STREAM, 0);
struct sockaddr_in own_addr = {0};
own_addr.sin_family = AF_INET;
own_addr.sin_port = htons(5678);
bind(sfd, (struct sockaddr *)&own_addr, sizeof(own_addr));
listen(sfd, 5);
static char message[] = "hello from server\n";
double wall_accept = 0;
double cpu_accept = 0;
int count = 0;
while (1) {
if (count++ == 1000) {
break;
}
double wall_start = get_wall_time();
double cpu_start = get_cpu_time();
int client_sfd = accept(sfd, NULL, NULL);
wall_accept += get_wall_time() - wall_start;
cpu_accept += get_cpu_time() - cpu_start;
send(client_sfd, message, sizeof(message), 0);
close(client_sfd);
}
wall = get_wall_time() - wall;
cpu = get_cpu_time() - cpu;
printf("wall accept: %lf\n", wall_accept);
printf("cpu accept: %lf\n", cpu_accept);
printf("wall: %lf\n", wall);
printf("cpu: %lf\n", cpu);
}
To test it I use seq 1000 | time parallel -j 1 -n0 'nc 127.0.0.1 5678' | wc -l, with these results:
wall accept: 6.436480
cpu accept: 0.010000
wall: 6.456266
cpu: 0.020000
For 10000 requests the result is:
wall accept: 55.434541
cpu accept: 0.080000
wall: 55.633679
cpu: 0.260000
Is accept() slow, or am I doing something wrong? Or is this a normal result for a single-threaded implementation?
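One way to separate the time spent waiting for a client from the cost of the accept() call itself is to poll() the listening socket first and only time the accept() that follows. A minimal standalone sketch of that idea (not part of the original program; it reuses the get_wall_time() helper above and omits the send/reply step):
#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

double get_wall_time(void); /* helper defined in the question */

int main(void)
{
    int sfd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr = {0};
    addr.sin_family = AF_INET;
    addr.sin_port = htons(5678);
    bind(sfd, (struct sockaddr *)&addr, sizeof(addr));
    listen(sfd, 5);
    double wait_time = 0, accept_time = 0;
    for (int i = 0; i < 1000; i++) {
        struct pollfd pfd = { .fd = sfd, .events = POLLIN };
        double t0 = get_wall_time();
        poll(&pfd, 1, -1);                  /* blocks until a connection is pending */
        double t1 = get_wall_time();
        int cfd = accept(sfd, NULL, NULL);  /* should now return almost immediately */
        double t2 = get_wall_time();
        wait_time += t1 - t0;
        accept_time += t2 - t1;
        close(cfd);
    }
    printf("waiting for clients: %lf\naccept() itself: %lf\n", wait_time, accept_time);
    return 0;
}
If most of the time shows up in the poll() column, the server is simply waiting for the next client to arrive rather than being slowed down by accept().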
UPD. I also wrote a server with pthreads that sends the message in a different thread.
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <pthread.h>
#include <stdlib.h>
#include <sys/time.h>
double get_wall_time()
{
struct timeval time;
if (gettimeofday(&time, NULL)){
return 0;
}
return (double)time.tv_sec + (double)time.tv_usec * 0.000001;
}
double get_cpu_time()
{
return (double)clock() / CLOCKS_PER_SEC;
}
void *send_message(void *pclient_sfd)
{
int client_sfd = *(int *)pclient_sfd;
free(pclient_sfd);
static char message[] = "hello from server\n";
send(client_sfd, message, sizeof(message), 0);
close(client_sfd);
return NULL;
}
int main()
{
double wall = get_wall_time();
double cpu = get_cpu_time();
int sfd = socket(AF_INET, SOCK_STREAM, 0);
struct sockaddr_in own_addr = {0};
own_addr.sin_family = AF_INET;
own_addr.sin_port = htons(5678);
bind(sfd, (struct sockaddr *)&own_addr, sizeof(own_addr));
listen(sfd, 5);
double wall_accept = 0;
double cpu_accept = 0;
int count = 0;
while (1) {
if (count++ == 10000) {
break;
}
int *pclient_sfd = malloc(sizeof(*pclient_sfd));
double wall_start = get_wall_time();
double cpu_start = get_cpu_time();
*pclient_sfd = accept(sfd, NULL, NULL);
wall_accept += get_wall_time() - wall_start;
cpu_accept += get_cpu_time() - cpu_start;
pthread_t tid;
pthread_create(&tid, NULL, send_message, (void *)pclient_sfd);
pthread_detach(tid); /* detach so finished threads are reclaimed; they are never joined */
}
wall = get_wall_time() - wall;
cpu = get_cpu_time() - cpu;
printf("wall accept: %lf\n", wall_accept);
printf("cpu accept: %lf\n", cpu_accept);
printf("wall: %lf\n", wall);
printf("cpu: %lf\n", cpu);
return 0;
}
Then I use seq 10000 | time parallel -j 4 -n0 'nc 127.0.0.1 5678' | wc -l and it takes 58 seconds.
It's the way you're testing it. When you use this
seq 10000 | time parallel -j 4 -n0 'nc 127.0.0.1 5678' | wc -l
that's actually going to skew the test, because you're spawning lots of processes; i.e. you're not really testing the C application, you're testing the ability to spawn processes.
If we change this to a simple Python script, i.e.
#!/usr/bin/env python
import socket
TCP_IP = '127.0.0.1'
TCP_PORT = 5678
BUFFER_SIZE = 1024
msg = "1"
for i in range(0, 1000):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect((TCP_IP, TCP_PORT))
    s.send(msg.encode('utf-8'))
    data = s.recv(BUFFER_SIZE)
    s.close()
    print("received data: %s" % data)
and run the test, the results are vastly different:
real 0m0.269s
user 0m0.074s
sys 0m0.114s
If you really want to test this and see how fast it is, you need to use separate machines, and you will typically need to write the client in C as well, or the limiting factor might be the client. The best tool I've seen for this (HTTP-specific), where you might also find some good code, is wrk:
https://github.com/wg/wrk
Take a look at the second argument of your listen() call and try increasing it.
Here is the text from man 2 listen:
The backlog argument defines the maximum length to which the queue of
pending connections for sockfd may grow. If a connection request
arrives when the queue is full, the client may receive an error with
an indication of ECONNREFUSED or, if the underlying protocol supports
retransmission, the request may be ignored so that a later reattempt
at connection succeeds.
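Applied to the server in the question, that is a one-line change; a sketch (SOMAXCONN asks for the system maximum, which the kernel may still cap, e.g. via net.core.somaxconn on Linux):
/* sfd is the listening socket from the question's code */
if (listen(sfd, SOMAXCONN) < 0) {   /* instead of listen(sfd, 5) */
    perror("listen");
}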
Update
With the added pthreaded C client, the problem is reproduced, indicating the long connection times come from the TCP protocol itself rather than from a specific implementation. Altering that behavior does not appear to be easily available.
Initial Question
I believe my question is largely: What does the Golang net package do when attempting to connect to a server over TCP and:
The server has no connections available, even in backlog.
The connection is not refused/failed.
There seems to be a large amount of overhead in making that connection, with server response times ramping up from 5 ms to several seconds. This was seen both in a production environment and in the minimal example below. The proper solution is to use connection pools to the server, which will be implemented; this question is largely out of curiosity.
To reproduce:
Run server with backlog = 1, run client.go.
All 50 goroutines fire at once, with a total completion time of almost 2 minutes.
Run server with backlog = 100, run client.go.
All 50 goroutines fire at once, queue up connected to the server, and complete in ~260 ms.
Running three C clients that retried every 50 us completed connections within 12 ms on average, so they did not show this issue.
Example output for backlog = 1 (first time is time to dial, second is time to completion):
user#computer ~/tcp-tests $ go run client.go 127.0.0.1:46999
Long Elapsed Time: 216.579µs, 315.196µs
Long Elapsed Time: 274.169µs, 5.970873ms
Long Elapsed Time: 74.4µs, 10.753871ms
Long Elapsed Time: 590.965µs, 205.851066ms
Long Elapsed Time: 1.029287689s, 1.029574065s
Long Elapsed Time: 1.02945649s, 1.035098229s
...
Long Elapsed Time: 3.045881865s, 6.378597166s
Long Elapsed Time: 3.045314838s, 6.383783688s
Time taken stats: 2.85 +/- 1.59 s // average +/- STDEV
Main Taken: 6.384677948s
Example output for backlog = 100:
...
Long Elapsed Time: 330.098µs, 251.004077ms
Long Elapsed Time: 298.146µs, 256.187795ms
Long Elapsed Time: 315.832µs, 261.523685ms
Time taken stats: 0.13 +/- 0.08 s
Main Taken: 263.186955ms
So what's going on under the hood of net.DialTCP (we used other flavors of dial as well, with no discernible difference) that causes the dial time to grow?
Polling time between attempts to make a connection?
An RFC 5681 Global Congestion Control (likely including mutex lock?) variable that gets incremented on all the initial failed connection attempts?
Something else?
I'm leaning towards the first two, as the 1s, 3s, 5s values seem to be magic numbers. They show up both on my modest local machine, and a large scale production environment.
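For reference, a small sketch of the arithmetic: assuming the initial SYN retransmission timeout is about 1 s and doubles on each retry (the usual Linux default), the cumulative dial times line up with the 1 s and 3 s values observed above.
/* Sketch (assumes ~1 s initial SYN retransmission timeout, doubling per retry). */
#include <stdio.h>

int main(void)
{
    double total = 0.0, rto = 1.0;
    for (int retry = 1; retry <= 4; retry++) {
        total += rto;
        printf("SYN retry %d after ~%.0f s (dial blocked for ~%.0f s total)\n",
               retry, rto, total);
        rto *= 2.0;   /* exponential backoff */
    }
    return 0;
}
This prints cumulative delays of roughly 1, 3, 7, and 15 seconds, which matches the observed dial times when the listen backlog is full.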
Here is the minimal server written in C. The configuration value of interest is the backlog argument to listen.
/*
Adapted from
https://www.geeksforgeeks.org/tcp-server-client-implementation-in-c/
Compile and run with:
gcc server.c -o server; ./server
*/
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <sys/time.h>
int main(void)
{
int socket_desc, client_sock;
socklen_t client_size;
struct sockaddr_in server_addr, client_addr;
char server_message[2000], client_message[2000];
// Clean buffers:
memset(server_message, '\0', sizeof(server_message));
memset(client_message, '\0', sizeof(client_message));
// Create socket:
socket_desc = socket(AF_INET, SOCK_STREAM, 0);
if(socket_desc < 0){
printf("Error while creating socket\n");
return -1;
}
printf("Socket created successfully\n");
// Set port and IP:
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(46999);
server_addr.sin_addr.s_addr = inet_addr("127.0.0.1");
// Bind to the set port and IP:
if(bind(socket_desc, (struct sockaddr*)&server_addr, sizeof(server_addr))<0){
printf("Couldn't bind to the port\n");
return -1;
}
printf("Done with binding\n");
// Listen for clients:
// Increasing the backlog allows the Go client to connect and wait
// rather than poll/retry.
if(listen(socket_desc, 100) < 0){
printf("Error while listening\n");
return -1;
}
printf("\nListening for incoming connections.....\n");
// Accept an incoming connection:
client_size = sizeof(client_addr);
int server_run = 1;
do
{
struct timeval start, end;
double cpu_time_used;
gettimeofday(&start, NULL);
client_sock = accept(socket_desc, (struct sockaddr*)&client_addr, &client_size);
if (client_sock < 0){
printf("Can't accept\n");
return -1;
}
// Receive client's message:
if (recv(client_sock, client_message, sizeof(client_message), 0) < 0){
printf("Couldn't receive\n");
return -1;
}
if (strcmp(client_message, "stop") == 0)
{
server_run = 0;
printf("Received stop message.\n");
}
// Respond to client:
strcpy(server_message, "This is the server's message.");
if (send(client_sock, server_message, strlen(server_message), 0) < 0){
printf("Can't send\n");
return -1;
}
// sleep for 5 ms
usleep(5000);
// Closing the socket:
close(client_sock);
gettimeofday(&end, NULL);
cpu_time_used = (end.tv_usec - start.tv_usec) / 1000.0;
if (cpu_time_used > 0.0) // overflow in tv_usec if negative
printf("Server Time: %.4f ms\n", cpu_time_used);
} while(server_run);
close(socket_desc);
return 0;
}
Here is the testing Go client
/*
Adapted from
https://www.linode.com/docs/guides/developing-udp-and-tcp-clients-and-servers-in-go/
Run once the server.c is compiled and running with:
go run client.go 127.0.0.1:46999
*/
package main
import (
"fmt"
"net"
"os"
"time"
"github.com/montanaflynn/stats"
"sync"
)
func do_message(wg *sync.WaitGroup, connect string, time_taken *float64) {
defer wg.Done()
message := make([]byte, 128)
start_time := time.Now()
pAddr, err := net.ResolveTCPAddr("tcp", connect)
if err != nil {
return
}
c, err := net.DialTCP("tcp", nil, pAddr)
if err != nil {
fmt.Println(err)
return
}
c.SetLinger(0)
dialed_time := time.Since(start_time)
defer func() {
c.Close()
elapsed_time := time.Since(start_time)
if elapsed_time.Microseconds() > 60 { // microseconds
fmt.Println("Long Elapsed Time: " + dialed_time.String() + ", " + elapsed_time.String())
}
*time_taken = float64(elapsed_time.Microseconds())
}()
text := "{\"service\": \"magic_service_str\"}"
c.Write([]byte(text))
code, _ := c.Read(message) // Does not actually wait for response.
code = code
}
func main() {
main_start := time.Now()
arguments := os.Args
if len(arguments) == 1 {
fmt.Println("Please provide host:port.")
return
}
n_messages := 50
wg := new(sync.WaitGroup)
wg.Add(n_messages)
times := make([]float64, n_messages)
for i := 0; i < n_messages; i++ {
// Used to turn the goroutines into serial implementation
// time.Sleep(5500 * time.Microsecond)
go do_message(wg, arguments[1], &times[i])
}
wg.Wait()
avg, _ := stats.Mean(times)
std, _ := stats.StandardDeviation(times)
fmt.Println("Time taken stats: " + fmt.Sprintf("%.2f", avg / 1000000.0) + " +/- " + fmt.Sprintf("%.2f", std / 1000000.0) + " s")
main_taken := time.Since(main_start)
fmt.Println("Main Taken: " + main_taken.String())
}
Updated pthreaded client in C and confirmed the issue is not the Golang implementation:
// gcc client_p.c -o pclient -lpthread
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/time.h>
#include <pthread.h>
#include <errno.h>
#ifndef THREAD_LOOP_COUNT
#define THREAD_LOOP_COUNT 1
#endif
/* Subtract the ‘struct timeval’ values X and Y,
storing the result in RESULT.
Return 1 if the difference is negative, otherwise 0.
https://www.gnu.org/software/libc/manual/html_node/Calculating-Elapsed-Time.html
*/
int
timeval_subtract (struct timeval *result, struct timeval *x, struct timeval *y)
{
/* Perform the carry for the later subtraction by updating y. */
if (x->tv_usec < y->tv_usec) {
int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
y->tv_usec -= 1000000 * nsec;
y->tv_sec += nsec;
}
if (x->tv_usec - y->tv_usec > 1000000) {
int nsec = (x->tv_usec - y->tv_usec) / 1000000;
y->tv_usec += 1000000 * nsec;
y->tv_sec -= nsec;
}
/* Compute the time remaining to wait.
tv_usec is certainly positive. */
result->tv_sec = x->tv_sec - y->tv_sec;
result->tv_usec = x->tv_usec - y->tv_usec;
/* Return 1 if result is negative. */
return x->tv_sec < y->tv_sec;
}
static void* workerThreadFunc(void* arg)
{
int socket_desc;
struct sockaddr_in server_addr;
char server_message[2000], client_message[2000];
// Clean buffers:
memset(server_message,'\0',sizeof(server_message));
memset(client_message,'\0',sizeof(client_message));
// Set port and IP the same as server-side:
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(46999);
server_addr.sin_addr.s_addr = inet_addr("127.0.0.1");
int retries = 0;
struct timeval start, end, difference;
double cpu_time_used;
for(int i = 0; i < THREAD_LOOP_COUNT; i++)
{
gettimeofday(&start, NULL);
// Create socket:
socket_desc = socket(AF_INET, SOCK_STREAM, 0);
if(socket_desc < 0){
printf("Unable to create socket\n");
return NULL;
}
// Send connection request to server:
while(connect(socket_desc, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0){
retries++;
if (retries > 10)
{
printf("Unable to connect\n");
retries = 0;
}
usleep(5);
}
retries = 0;
// Send the message to server:
if(send(socket_desc, client_message, strlen("client message."), 0) < 0){
printf("Unable to send message\n");
close(socket_desc);
return NULL;
}
// Receive the server's response:
if(recv(socket_desc, server_message, sizeof(server_message), 0) < 0){
printf("Error while receiving server's msg\n");
close(socket_desc);
return NULL;
}
// Close the socket:
close(socket_desc);
gettimeofday(&end, NULL);
timeval_subtract (&difference, &end, &start);
cpu_time_used = (double)difference.tv_sec + (double)difference.tv_usec / 1000000.0;
printf("Client Time: %.4e s\n", cpu_time_used);
}
return NULL;
}
int main(int argc, char **argv)
{
int n_threads = 50; // default value
if (argc > 1)
n_threads = atoi(argv[1]);
pthread_t *threads = (pthread_t*)malloc(n_threads * sizeof(pthread_t));
struct timeval start, end, difference;
gettimeofday(&start, NULL);
for(int i = 0; i < n_threads; i++)
{
int createRet = pthread_create(&threads[i], NULL, workerThreadFunc, NULL);
if (createRet != 0)
{
printf("failed to create thread\n");
}
}
for(int i = 0; i < n_threads; i++)
pthread_join(threads[i], NULL);
gettimeofday(&end, NULL);
timeval_subtract (&difference, &end, &start);
double cpu_time_used = (double)difference.tv_sec + (double)difference.tv_usec / 1000000.0;
printf("Total Client Time: %.4e s\n", cpu_time_used);
free(threads);
return 0;
}
As indicated by @user207421, the issue lies in the TCP implementation, which includes an exponential backoff on connection retries. Neither Golang nor C appears to have an easy way to alter this behavior.
The answer is: don't open and close TCP connections if you need high throughput; use a connection pool.
There has been some work on removing the exponential backoff, linked below, but for specific cases there is likely a better solution. There was for me.
ACM SIGCOMM Computer Communication Review, "Removing Exponential Backoff from TCP", Volume 38, Number 5, October 2008.
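As a rough illustration of the pooling idea in C: dial once and reuse the socket for many request/response round trips. This is only a sketch; it assumes a server that keeps the connection open and answers many requests on it, which the example server above does not do (it closes after one reply), and it reuses the hypothetical request string from the Go client.
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr = {0};
    addr.sin_family = AF_INET;
    addr.sin_port = htons(46999);
    addr.sin_addr.s_addr = inet_addr("127.0.0.1");
    if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("connect");
        return 1;
    }
    char reply[2000];
    for (int i = 0; i < 50; i++) {            /* 50 requests over one connection */
        const char *req = "{\"service\": \"magic_service_str\"}";
        send(fd, req, strlen(req), 0);
        recv(fd, reply, sizeof(reply), 0);
    }
    close(fd);
    return 0;
}
With connection reuse there is only one TCP handshake, so a full listen backlog and the resulting SYN retransmission backoff are hit at most once rather than once per request.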
I'm trying to create a raw-socket-based program that uses packet mmap (a PACKET_TX_RING) to send packets at a fast rate.
The following code is adapted from the example at this gist. It does send packets, but it doesn't send them fast. On my 1 Gbps NIC (r8169 driver), it only sends at a rate of about 95,000 packets/second on my Core i7 processor (3.1 GHz). I believe it could send at a much higher rate.
I'm not sure what the bottleneck is. Any ideas? Thanks!
Here is the code snippet:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <arpa/inet.h>
#include <netinet/if_ether.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <linux/if.h>
#include <linux/if_packet.h>
#include <sys/time.h>
#define PACKET_QDISC_BYPASS 20
/// The number of frames in the ring
// This number is not set in stone. Nor are block_size, block_nr or frame_size
#define CONF_RING_FRAMES 1024
#define CONF_DEVICE "eth0"
/// Offset of data from start of frame
#define PKT_OFFSET (TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + \
TPACKET_ALIGN(sizeof(struct sockaddr_ll)))
/// (unimportant) macro for loud failure
#define RETURN_ERROR(lvl, msg) \
do { \
fprintf(stderr, msg); \
return lvl; \
} while(0);
static struct sockaddr_ll txring_daddr;
double getTS() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec/1000000.0;
}
/// create a linklayer destination address
// #param ringdev is a link layer device name, such as "eth0"
static int
init_ring_daddr(int fd, const char *ringdev)
{
struct ifreq ifreq;
// get device index
strcpy(ifreq.ifr_name, ringdev);
if (ioctl(fd, SIOCGIFINDEX, &ifreq)) {
perror("ioctl");
return -1;
}
txring_daddr.sll_family = AF_PACKET;
txring_daddr.sll_protocol = htons(ETH_P_IP);
txring_daddr.sll_ifindex = ifreq.ifr_ifindex;
// set the linklayer destination address
// NOTE: this should be a real address, not ff.ff....
txring_daddr.sll_halen = ETH_ALEN;
memset(&txring_daddr.sll_addr, 0xff, ETH_ALEN);
return 0;
}
/// Initialize a packet socket ring buffer
// #param ringtype is one of PACKET_RX_RING or PACKET_TX_RING
static char *
init_packetsock_ring(int fd, int ringtype)
{
struct tpacket_req tp;
char *ring;
// tell kernel to export data through mmap()ped ring
tp.tp_block_size = CONF_RING_FRAMES * getpagesize();
tp.tp_block_nr = 1;
tp.tp_frame_size = getpagesize();
tp.tp_frame_nr = CONF_RING_FRAMES;
if (setsockopt(fd, SOL_PACKET, ringtype, (void*) &tp, sizeof(tp))) {
perror("setting up ring");
RETURN_ERROR(NULL, "setsockopt() ring\n");
}
#ifdef TPACKET_V2
printf("it's TPACKET_V2\n");
int val = TPACKET_V1;
setsockopt(fd, SOL_PACKET, PACKET_HDRLEN, &val, sizeof(val));
#endif
// open ring
ring = mmap(0, tp.tp_block_size * tp.tp_block_nr,
PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (ring == MAP_FAILED)
RETURN_ERROR(NULL, "mmap()\n");
if (init_ring_daddr(fd, CONF_DEVICE))
return NULL;
return ring;
}
/// Create a packet socket. If param ring is not NULL, the buffer is mapped
// #param ring will, if set, point to the mapped ring on return
// #return the socket fd
static int
init_packetsock(char **ring, int ringtype)
{
int fd;
// open packet socket
//fd = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
//fd = socket(AF_INET,SOCK_RAW,htons(ETH_P_ALL)); //ETH_P_ALL = 3
fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
if (fd < 0) {
perror("open socket");
RETURN_ERROR(-1, "Root privileges are required\nsocket() rx. \n");
}
if (ring) {
*ring = init_packetsock_ring(fd, ringtype);
if (!*ring) {
close(fd);
return -1;
}
}
return fd;
}
static int
exit_packetsock(int fd, char *ring)
{
if (munmap(ring, CONF_RING_FRAMES * getpagesize())) {
perror("munmap");
return 1;
}
if (close(fd)) {
perror("close");
return 1;
}
return 0;
}
/// transmit a packet using packet ring
// NOTE: for high rate processing try to batch system calls,
// by writing multiple packets to the ring before calling send()
//
// #param pkt is a packet from the network layer up (e.g., IP)
// #return 0 on success, -1 on failure
static int process_tx(int fd, char *ring, const char *pkt, int pktlen)
{
static int ring_offset = 0;
struct tpacket_hdr *header;
struct pollfd pollset;
char *off;
int ret;
// fetch a frame
// like in the PACKET_RX_RING case, we define frames to be a page long,
// including their header. This explains the use of getpagesize().
header = (void *) ring + (ring_offset * getpagesize());
assert((((unsigned long) header) & (getpagesize() - 1)) == 0);
while (header->tp_status != TP_STATUS_AVAILABLE) {
// if none available: wait on more data
pollset.fd = fd;
pollset.events = POLLOUT;
pollset.revents = 0;
ret = poll(&pollset, 1, 1000 /* don't hang */);
if (ret < 0) {
if (errno != EINTR) {
perror("poll");
return -1;
}
//return 0;
}
}
// fill data
off = ((void *) header) + (TPACKET_HDRLEN - sizeof(struct sockaddr_ll));
memcpy(off, pkt, pktlen);
// fill header
header->tp_len = pktlen;
header->tp_status = TP_STATUS_SEND_REQUEST;
// increase consumer ring pointer
ring_offset = (ring_offset + 1) & (CONF_RING_FRAMES - 1);
// notify kernel
return 0;
}
/// Example application that opens a packet socket with rx_ring
int main(int argc, char **argv)
{
char *ring;
char pkt[125] = {0x00,0x0c,0x29,0xa4,0xff,0xbc,0x40,0x25,0xc2,0xd9,0xfb,0x8c,0x08,0x00,0x45,0x00,0x00,0x6f,0x24,0x1b,0x40,0x00,0x40,0x06,0x02,0x4b,0x0a,0x00,0x00,0x07,0x0a,0x00,0x00,0x1d,0xb8,0x64,0x01,0xbb,0x80,0x9e,0xaa,0x77,0x17,0x6d,0xa2,0x04,0x80,0x18,0x00,0x73,0x03,0xa0,0x00,0x00,0x01,0x01,0x08,0x0a,0x01,0x27,0x8e,0xaf,0x00,0x01,0xe8,0x71,0x16,0x03,0x01,0x00,0x36,0x01,0x00,0x00,0x32,0x03,0x02,0x55,0xf5,0x01,0xa9,0xc0,0xca,0xae,0xd6,0xd2,0x9b,0x6a,0x79,0x6d,0x9a,0xe8,0x9d,0x78,0xe2,0x64,0x98,0xf0,0xac,0xcb,0x2c,0x0d,0x51,0xa5,0xf8,0xc4,0x0f,0x93,0x87,0x00,0x00,0x04,0x00,0x35,0x00,0xff,0x01,0x00,0x00,0x05,0x00,0x0f,0x00,0x01,0x01};
int fd;
printf("page size %x\n", getpagesize());
fd = init_packetsock(&ring, PACKET_TX_RING);
if (fd < 0)
return 1;
// TODO: make correct IP packet out of pkt
int i;
double startTs = getTS();
double currentTs;
int pktCnt = 0;
int sendCnt = 0;
while (1) {
for (i=0; i<1000; i++) {
pkt[1] ++; pktCnt++;
process_tx(fd, ring, pkt, 125);
}
if (sendto(fd, NULL, 0, 0, (void *) &txring_daddr, sizeof(txring_daddr)) < 0) {
perror("sendto");
return -1;
}
sendCnt++;
usleep(300);
currentTs = getTS();
if ((currentTs - startTs) >= 1.0) {
startTs += 1.0;
printf("%7d %6d\n", pktCnt, sendCnt);
pktCnt = 0; sendCnt = 0;
}
}
if (exit_packetsock(fd, ring))
return 1;
printf("OK\n");
return 0;
}
UPDATE1
The current NIC is a Realtek RTL8111/8168/8411. After upgrading the driver to version 8.044, the rate goes up to about 135K packets/second.
Running the same program on an Intel 82577LM Gigabit NIC gives a rate of about 430K packets/second.
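One small thing worth noting: the snippet defines PACKET_QDISC_BYPASS but never enables it. On kernels that support it (Linux 3.14 and later) it is turned on with an ordinary setsockopt on the packet socket; a sketch, to be placed right after the socket() call in init_packetsock():
/* Sketch: bypass the kernel qdisc layer for this packet socket (Linux >= 3.14).
 * On older kernels the call simply fails and can be ignored. */
int one = 1;
if (setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one)) < 0)
    perror("PACKET_QDISC_BYPASS");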
I'd like to use nanomsg as a bus system, so I wrote a performance test and tried it using two PCs.
First I wrote a server, which connects to the other server:
#include <nanomsg/nn.h>
#include <nanomsg/bus.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
int main(int argc, char *argv[]) {
if (argc == 2) {
int socket = nn_socket (AF_SP_RAW, NN_BUS);
assert(nn_bind (socket, "tcp://*:27384") != -1);
assert(nn_connect (socket, argv[1]) != -1);
assert(nn_device(socket, -1) != -1);
nn_close(socket);
}
}
In my case I run these commands as:
On the first PC: ./server tcp://192.168.1.11:27384
On the second PC: ./server tcp://192.168.1.241:27384
They are connected; to prove it, I used nanocat and connected it locally to the server:
On the first PC:
nanocat --bus --connect tcp://127.0.0.1:27384 --data foo --interval 1 --ascii
On the second PC:
nanocat --bus --connect tcp://127.0.0.1:27384 --data bar --interval 1 --ascii
On the first PC, I received a 'bar' every second, on the second PC a 'foo', also every second.
So now I wrote the receiver:
#include <nanomsg/nn.h>
#include <nanomsg/bus.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <unistd.h>
int main(int argc, char *argv[]) {
int socket = nn_socket (AF_SP, NN_BUS);
assert(nn_connect (socket, "tcp://127.0.0.1:27384") != -1);
sleep(1);
unsigned char buffer[4096];
while(1) {
int n = nn_recv(socket, buffer, 4096, 0);
if (n > 0) {
nn_send(socket, buffer, n, 0); /* echo back exactly the n bytes received */
}
}
nn_close(socket);
}
It receives a message and sends it back.
Then I wrote the sender:
#include <nanomsg/nn.h>
#include <nanomsg/bus.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <string.h>
#define NANO_PER_SEC 1000000000.0
int main(int argc, char *argv[]) {
int socket = nn_socket (AF_SP, NN_BUS);
nn_connect (socket, "tcp://127.0.0.1:27384");
sleep(1);
unsigned char buffer[4096];
int i = 0;
for (i = 0; i < 1024; i++) {
buffer[i] = 'a';
}
buffer[i] = '\0';
struct timespec start, end;
double start_sec, end_sec, elapsed_sec;
double average;
double m[4096];
for (i = 0; i < 4096; i++) {
clock_gettime(CLOCK_REALTIME, &start);
int ns = nn_send(socket, buffer, strlen(buffer), 0);
int nr = nn_recv(socket, buffer, 4096, 0);
clock_gettime(CLOCK_REALTIME, &end);
start_sec = start.tv_sec + start.tv_nsec/NANO_PER_SEC;
end_sec = end.tv_sec + end.tv_nsec/NANO_PER_SEC;
m[i] = end_sec - start_sec;
}
elapsed_sec = 0.0;
for (i = 0; i < 4096; i++) {
elapsed_sec = elapsed_sec + m[i];
}
average = (elapsed_sec / 4096.0) * 1000000.0;
printf("Average: %.3f micros\nWhole: %.12f seconds\n", average, elapsed_sec);
nn_close(socket);
}
The sender transmits 1 kbyte to the receiver 4096 times and measures the time,
so I get the total time and the average time.
First I tested it locally on a single PC, in three open bash terminals.
First terminal: ./server tcp://192.168.1.11:27384
Second terminal: ./receiver
Third terminal: ./sender
From the "sender" program I got this output:
Average: 60.386 micros
Whole: 0.247341632843 seconds
Then I tried to run this:
On first PC:
./server tcp://192.168.1.11:27384
./receiver
On second PC:
./server tcp://192.168.1.241:27384
./sender
But it gets stuck: the first PC, running the "receiver", doesn't receive any message from the second PC, which runs the "sender". I don't get what's wrong, because with nanocat it works fine.
Can somebody please help me?
I wrote a test program as follows:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/time.h>
#include <time.h>
#define PACKET_SIZE 500
#define LOOP_COUNT 30000
int g_sndsucc = 0;
int g_sndfail = 0;
const int C_IPC_KEY = 0x00231a95;
const int COUNT_SIZE = 10000;
unsigned long g_count = 0;
unsigned long g_t1 = 0;
struct timeval s1, s2, s3, s4;
int main(int argc, char* argv[])
{
int ipckey = C_IPC_KEY;
if(argc > 1)
{
ipckey = atoi(argv[1]);
printf("ipckey is %d\n", ipckey);
}
int qid = msgget(ipckey, IPC_CREAT | 0666);
if(qid <= 0)
{
printf("msgget err: %d \n", errno);
return 0;
}
char data[PACKET_SIZE];
memset(data, 'a', PACKET_SIZE-1);
data[PACKET_SIZE-1] = '\0';
*((long *)data) = 0;
int ret = 0;
struct timeval start;
gettimeofday (&start, NULL);
while(1)
{
*((long *)data) +=1;
gettimeofday (&s1, NULL);
ret = msgsnd(qid, data, PACKET_SIZE,0);
gettimeofday (&s2, NULL);
if(ret != 0)
{
g_sndfail ++;
}
else
{
g_sndsucc ++;
}
g_count++;
g_t1 += (s2.tv_sec-s1.tv_sec)*1000000 + (s2.tv_usec-s1.tv_usec);
if ( g_count >= 10000)
{
printf("STAT1: t1 : %f\n",
10000000000.0 / g_t1);
g_count = 0;
g_t1 = 0;
}
usleep(1000);
}
return 0;
}
I create 100 identical processes that call msgsnd, and on SUSE each process's msgsnd TPS only reaches about 50/s.
But on AIX 5 the msgsnd TPS can reach 10000/s.
Does anyone know why the performance of IPC on Linux is so poor with multiple processes?
And how can I increase the performance on Linux?
BTW, the kernel version of SUSE is Linux 3.0.13.
I checked the source code of msgget in Linux 3.8.
When a thread does not get the msg lock, it does not release the CPU and sleep for a while.
Instead it calls ipc_lock_by_ptr(&msq->q_perm) repeatedly.
So the CPU usage gets very high, and the collision rate grows rapidly as the number of threads increases.
How can I calculate or estimate the RTT (round-trip time) between a client and a server?
A tutorial or sample addressing this would also help.
Here is what I do:
#include <rpc/rpc.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/times.h>
#include <fcntl.h>
#include <string.h>
#include <time.h>
int main(int argc, char *argv[]) {
enum clnt_stat status;
CLIENT *handle;
struct timeval t;
clock_t rtime;
struct tms dumm;
int count = 100000;
int i;
time_t now;
char stamp[27];
int programm;
int version;
if (argc != 4) {
printf("Usage: rpcping <host> <program> <version>\n");
exit(1);
}
/*
* Create Client Handle
*/
programm = atoi(argv[2]);
version = atoi(argv[3]);
handle = clnt_create(argv[1], programm, version, "tcp");
if (handle == NULL) {
printf("clnt failed\n");
exit(1);
}
/*
* use 30 seconds timeout
*/
t.tv_sec = 30;
t.tv_usec = 0;
while (1) {
rtime = times(&dumm);
for (i = 0; i < count; i++) {
status = clnt_call(handle, 0, (xdrproc_t) xdr_void,
NULL, (xdrproc_t) xdr_void, NULL, t);
if (status == RPC_SUCCESS) { /* NOP */ }
}
now = time(NULL);
ctime_r(&now, stamp);
stamp[strlen(stamp) - 1] = '\0';
fprintf(stdout, "[%s]: Speed: %2.4f calls/s.\n", stamp,
count / ((double) (times(&dumm) - rtime) / (double) sysconf(_SC_CLK_TCK)));
fflush(stdout);
}
clnt_destroy(handle);
}
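The loop above reports an aggregate rate (calls per second) rather than a per-call RTT. A sketch of timing a single NULL-procedure call, reusing the handle and the t timeout created above and assuming CLOCK_MONOTONIC is available:
/* Sketch: round-trip time of one RPC NULL-procedure call. */
struct timespec t0, t1;
clock_gettime(CLOCK_MONOTONIC, &t0);
status = clnt_call(handle, 0, (xdrproc_t) xdr_void, NULL,
                   (xdrproc_t) xdr_void, NULL, t);
clock_gettime(CLOCK_MONOTONIC, &t1);
if (status == RPC_SUCCESS) {
    double rtt_us = (t1.tv_sec - t0.tv_sec) * 1e6 +
                    (t1.tv_nsec - t0.tv_nsec) / 1e3;
    printf("RTT: %.1f us\n", rtt_us);
}
Averaging this per-call value over many iterations gives an RTT estimate directly, instead of deriving it from the calls-per-second figure.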
I have a multithreaded version as well:
https://gist.github.com/2401404
tigran.