Why is the following code slow? And by slow I mean 100x-1000x slow. It just repeatedly performs read/write directly on a TCP socket. The curious part is that it remains slow only if I use two function calls for both read AND write as shown below. If I change either the server or the client code to use a single function call (as in the comments), it becomes super fast.
Code snippet:
int main(...) {
int sock = ...; // open TCP socket
int i;
char buf[100000];
for(i=0;i<2000;++i)
{ if(amServer)
{ write(sock,buf,10);
// read(sock,buf,20);
read(sock,buf,10);
read(sock,buf,10);
}else
{ read(sock,buf,10);
// write(sock,buf,20);
write(sock,buf,10);
write(sock,buf,10);
}
}
close(sock);
}
We stumbled on this in a larger program that was actually using stdio buffering. It mysteriously became sluggish the moment the payload size exceeded the buffer size by a small margin. Then I did some digging around with strace, and finally boiled the problem down to this. I can solve this by fooling around with the buffering strategy, but I'd really like to know what on earth is going on here. On my machine, it goes from 0.030 s to over a minute (tested both locally and against remote machines) when I change the single read/write call into two calls.
These tests were done on various Linux distros, and various kernel versions. Same result.
Fully runnable code with networking boilerplate:
#include <netdb.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/ip.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
static int getsockaddr(const char* name,const char* port, struct sockaddr* res)
{
struct addrinfo* list;
if(getaddrinfo(name,port,NULL,&list) < 0) return -1;
for(;list!=NULL && list->ai_family!=AF_INET;list=list->ai_next);
if(!list) return -1;
memcpy(res,list->ai_addr,list->ai_addrlen);
freeaddrinfo(list);
return 0;
}
// used as sock=tcpConnect(...); ...; close(sock);
static int tcpConnect(struct sockaddr_in* sa)
{
int outsock;
if((outsock=socket(AF_INET,SOCK_STREAM,0))<0) return -1;
if(connect(outsock,(struct sockaddr*)sa,sizeof(*sa))<0) return -1;
return outsock;
}
int tcpConnectTo(const char* server, const char* port)
{
struct sockaddr_in sa;
if(getsockaddr(server,port,(struct sockaddr*)&sa)<0) return -1;
int sock=tcpConnect(&sa); if(sock<0) return -1;
return sock;
}
int tcpListenAny(const char* portn)
{
in_port_t port;
int outsock;
if(sscanf(portn,"%hu",&port)<1) return -1;
if((outsock=socket(AF_INET,SOCK_STREAM,0))<0) return -1;
int reuse = 1;
if(setsockopt(outsock,SOL_SOCKET,SO_REUSEADDR,
(const char*)&reuse,sizeof(reuse))<0) return fprintf(stderr,"setsockopt() failed\n"),-1;
struct sockaddr_in sa = { .sin_family=AF_INET, .sin_port=htons(port)
, .sin_addr={INADDR_ANY} };
if(bind(outsock,(struct sockaddr*)&sa,sizeof(sa))<0) return fprintf(stderr,"Bind failed\n"),-1;
if(listen(outsock,SOMAXCONN)<0) return fprintf(stderr,"Listen failed\n"),-1;
return outsock;
}
int tcpAccept(const char* port)
{
int listenSock, sock;
listenSock = tcpListenAny(port);
if((sock=accept(listenSock,0,0))<0) return fprintf(stderr,"Accept failed\n"),-1;
close(listenSock);
return sock;
}
void writeLoop(int fd,const char* buf,size_t n)
{
// Don't even bother incrementing buffer pointer
while(n) n-=write(fd,buf,n);
}
void readLoop(int fd,char* buf,size_t n)
{
while(n) n-=read(fd,buf,n);
}
int main(int argc,char* argv[])
{
if(argc<3)
{ fprintf(stderr,"Usage: round {server_addr|--} port\n");
return -1;
}
bool amServer = (strcmp("--",argv[1])==0);
int sock;
if(amServer) sock=tcpAccept(argv[2]);
else sock=tcpConnectTo(argv[1],argv[2]);
if(sock<0) { fprintf(stderr,"Connection failed\n"); return -1; }
int i;
char buf[100000] = { 0 };
for(i=0;i<4000;++i)
{
if(amServer)
{ writeLoop(sock,buf,10);
readLoop(sock,buf,20);
//readLoop(sock,buf,10);
//readLoop(sock,buf,10);
}else
{ readLoop(sock,buf,10);
writeLoop(sock,buf,20);
//writeLoop(sock,buf,10);
//writeLoop(sock,buf,10);
}
}
close(sock);
return 0;
}
EDIT: This version is slightly different from the other snippet in that it reads/writes in a loop. So in this version, two separate writes automatically cause two separate read() calls, even if readLoop is called only once. But otherwise the problem still remains.
Interesting. You are a victim of Nagle's algorithm combined with TCP delayed acknowledgements.
Nagle's algorithm is a mechanism used in TCP to defer transmission of small segments until enough data has accumulated to make it worth building and sending a segment over the network. From the Wikipedia article:
Nagle's algorithm works by combining a number of small outgoing
messages, and sending them all at once. Specifically, as long as there
is a sent packet for which the sender has received no acknowledgment,
the sender should keep buffering its output until it has a full
packet's worth of output, so that output can be sent all at once.
However, TCP typically also employs delayed acknowledgements, a technique that batches several ACK replies together (possible because TCP uses cumulative ACKs) to reduce network traffic.
The Wikipedia article further mentions this:
With both algorithms enabled, applications that do two successive
writes to a TCP connection, followed by a read that will not be
fulfilled until after the data from the second write has reached the
destination, experience a constant delay of up to 500 milliseconds,
the "ACK delay".
(Emphasis mine)
In your specific case, since the server doesn't send more data before reading the reply, the client is causing the delay: if the client writes twice, the second write will be delayed.
If Nagle's algorithm is being used by the sending party, data will be
queued by the sender until an ACK is received. If the sender does not
send enough data to fill the maximum segment size (for example, if it
performs two small writes followed by a blocking read) then the
transfer will pause up to the ACK delay timeout.
So, when the client makes 2 write calls, this is what happens:
Client issues the first write.
The server receives some data. It doesn't acknowledge it in the hope that more data will arrive (so it can batch up a bunch of ACKs in one single ACK).
Client issues the second write. The previous write has not been acknowledged, so Nagle's algorithm defers transmission until more data arrives (until enough data has been collected to make a segment) or the previous write is ACKed.
The server gets tired of waiting and, after the ACK delay expires (up to 500 ms), acknowledges the segment.
Client finally completes the 2nd write.
With 1 write, this is what happens:
Client issues the first write.
The server receives some data. It doesn't acknowledge it in the hope that more data will arrive (so it can batch up a bunch of ACKs in one single ACK).
The server writes to the socket. An ACK is part of the TCP header, so if you're writing, you might as well acknowledge the previous segment at no extra cost. Do it.
Meanwhile, the client wrote once, so it was already waiting on the next read - there was no 2nd write waiting for the server's ACK.
If you want to keep writing twice on the client side, you need to disable Nagle's algorithm. Here is the advice from the algorithm's author himself:
The user-level solution is to avoid write-write-read sequences on
sockets. write-read-write-read is fine. write-write-write is fine. But
write-write-read is a killer. So, if you can, buffer up your little
writes to TCP and send them all at once. Using the standard UNIX I/O
package and flushing write before each read usually works.
(See the citation on Wikipedia)
As mentioned by David Schwartz in the comments, this may not be the greatest idea for various reasons, but it illustrates the point and shows that this is indeed causing the delay.
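To illustrate the quoted advice, here is a minimal sketch of the client side only (assuming the sock from the snippet above): the socket is wrapped in a buffered stdio stream for writing, so the two small writes are coalesced into a single write() when the stream is flushed before the next read. Error handling is omitted.
FILE *out = fdopen(dup(sock), "w"); /* dup() so fclose() won't also close sock */
setvbuf(out, NULL, _IOFBF, 8192);   /* fully buffered writes */
char buf[100000];
int i;
for (i = 0; i < 2000; ++i)
{ read(sock, buf, 10);
  fwrite(buf, 1, 10, out);
  fwrite(buf, 1, 10, out);
  fflush(out);                      /* both small writes go out as one segment */
}
fclose(out);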
To disable Nagle's algorithm, you need to set the TCP_NODELAY option on the sockets with setsockopt(2).
This can be done in tcpConnectTo() for the client:
int tcpConnectTo(const char* server, const char* port)
{
struct sockaddr_in sa;
if(getsockaddr(server,port,(struct sockaddr*)&sa)<0) return -1;
int sock=tcpConnect(&sa); if(sock<0) return -1;
int val = 1;
if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) < 0)
perror("setsockopt(2) error");
return sock;
}
And in tcpAccept() for the server:
int tcpAccept(const char* port)
{
int listenSock, sock;
listenSock = tcpListenAny(port);
if((sock=accept(listenSock,0,0))<0) return fprintf(stderr,"Accept failed\n"),-1;
close(listenSock);
int val = 1;
if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)) < 0)
perror("setsockopt(2) error");
return sock;
}
It's interesting to see the huge difference this makes.
If you'd rather not mess with the socket options, it's enough to ensure that the client writes once - and only once - before the next read. You can still have the server read twice:
for(i=0;i<4000;++i)
{
if(amServer)
{ writeLoop(sock,buf,10);
//readLoop(sock,buf,20);
readLoop(sock,buf,10);
readLoop(sock,buf,10);
}else
{ readLoop(sock,buf,10);
writeLoop(sock,buf,20);
//writeLoop(sock,buf,10);
//writeLoop(sock,buf,10);
}
}
Related
I am trying to build a system that opens parallel TCP sockets using threads.
My threads are triggered using message queue IPC; thus every time a packet arrives in the message queue a thread "wakes up", opens a TCP connection to the remote server and sends the packet.
My problem is that in Wireshark I can see that the time it takes to send a file is smaller using threads instead of one connection, but the throughput does not change.
My questions are:
1. How can I verify that my threads are working in parallel?
2. How can I improve this code?
3. How can I open several sockets using one thread?
I am using a virtual machine to run the multithreaded clients.
The IDE I am using is CLion; the language is C.
My code:
#include<stdio.h>
#include<stdlib.h>
#include<sys/socket.h>
#include<string.h>
#include <arpa/inet.h>
#include <unistd.h> // for close
#include<pthread.h>
#include <math.h>
#include<malloc.h>
#include<signal.h>
#include<stdbool.h>
#include<sys/types.h>
#include<linux/if_packet.h>
#include<netinet/in.h>
#include<netinet/if_ether.h> // for ethernet header
#include<netinet/ip.h> // for ip header
#include<netinet/udp.h> // for udp header
#include<netinet/tcp.h>
#include <byteswap.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <mqueue.h>
#include <assert.h>
#include <time.h>
#define QUEUE_NAME "/ServerDan_Queue"
#define QUEUE_PERM 0660
#define MAX_MESSAGES 10 //Max size = 10
#define MAX_MSG_SIZE 4105 //Max size = 8192B
#define MSG_BUFFER_SIZE MAX_MSG_SIZE+10
#define BSIZE 1024
#define Nbytes 4096
#define ElorServer_addr "192.168.1.54"
///params:
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
struct sockaddr_in server;
struct stat obj;
int sock;
int k, size, status;
int i = 0;
typedef struct frag
{
int packet_number;
int seq;
uint8_t data[4096];
bool lastfrag;
} fragma;
void * middlemanThread(void *arg)
{
///========================================///
///**** Waiting for message queue trigger!:///
///=======================================///
long id = (long)arg;
id+=1;
mqd_t qd; //queue descriptor
//open the queue for reading//
qd= mq_open(QUEUE_NAME,O_RDONLY);
assert(qd != -1);
struct mq_attr attr;
assert(mq_getattr(qd,&attr) != -1);
uint8_t *income_buf = calloc(attr.mq_msgsize,1);
uint8_t *cast_buf = calloc(attr.mq_msgsize,1);
assert(income_buf);
fragma frag;
struct timespec timeout;
clock_gettime(CLOCK_REALTIME,&timeout);
timeout.tv_sec+=50;
//bool closesoc =false;
printf("Waiting for messages ..... \n\n");
while(1){
///========================================///
///**** Open message queue fo receive:///
///=======================================///
if((mq_timedreceive(qd,income_buf,attr.mq_msgsize,0,&timeout))<0){
printf("Failed to receive message for 50 sec \n");
//closesoc =true;
pthread_exit(NULL);
}
else{
cast_buf = income_buf;
printf("Received successfully , your msg :\n");
frag.packet_number = *cast_buf;
cast_buf = (cast_buf + sizeof(int));
frag.seq = *cast_buf;
cast_buf = (cast_buf + sizeof(int));
memccpy(frag.data,((fragma*)cast_buf)->data,0,Nbytes);
cast_buf = cast_buf + Nbytes;
frag.lastfrag = *cast_buf;
uint8_t * data = frag.data;
}
pthread_mutex_lock(&lock);
///========================================///
///**** Connecting to Server and send Frament:///
///=======================================///
int size = sizeof(( fragma *)income_buf)->packet_number + sizeof(( fragma *)income_buf)->seq + sizeof(( fragma *)income_buf)->data + sizeof(( fragma *)income_buf)->lastfrag;
printf("In thread\n");
int clientSocket;
struct sockaddr_in serverAddr;
socklen_t addr_size;
// Create the socket.
clientSocket = socket(PF_INET, SOCK_STREAM, 0);
//Configure settings of the server address
// Address family is Internet
serverAddr.sin_family = AF_INET;
//Set port number, using htons function
serverAddr.sin_port = htons(8081);
//Set IP address to localhost
serverAddr.sin_addr.s_addr = inet_addr("192.168.14.149");
memset(serverAddr.sin_zero, '\0', sizeof serverAddr.sin_zero);
//Connect the socket to the server using the address
addr_size = sizeof serverAddr;
connect(clientSocket, (struct sockaddr *) &serverAddr, addr_size);
if(send(clientSocket , income_buf , size,0) < 0)
{
printf("Send failed\n");
}
printf("Trhead Id : %ld \n" , id);
printf("Packet number : %d \n Seq = %d \n lasfrag = %d\n\n",frag.packet_number,frag.seq,(int)frag.lastfrag);
pthread_mutex_unlock(&lock);
//if(closesoc)
close(clientSocket);
usleep(20000);
}
}
int main(){
int i = 0;
pthread_t tid[5];
while(i< 5)
{
if( pthread_create(&tid[i], NULL, middlemanThread, (void*)i) != 0 )
printf("Failed to create thread\n");
i++;
}
sleep(2);
i = 0;
while(i< 5)
{
pthread_join(tid[i++],NULL);
printf("Thread ID : %d:\n",i);
}
return 0;
}
thus every time a packet arrives in the message queue a thread "wakes up", opens a TCP connection to the remote server and sends the packet
If you're at all concerned about speed or efficiency, don't do this. The single most expensive thing you can do with a TCP socket is the initial connection. You're doing a 3-way handshake just to send a single message!
Then, you're holding a global mutex while doing this entire operation - which, again, is the slowest operation in your program.
The current design is effectively single-threaded, but in the most complicated and expensive possible way.
I can see that the time it takes to send a file is smaller using threads instead of one connection, but the throughput does not change
I have no idea what you're actually measuring, and it's not at all clear that you do either. What is a file? One fragment? Multiple fragments? How big is it compared to your MTU? Have you checked that the fragments are actually received in the correct order? It looks to me like the only possible parallelism is exactly the spot where that could break.
How is it possible to have lower latency and unchanged throughput for a single file?
How can I verify my threads are working in parallel?
If you see multiple TCP connections in wireshark with different source ports, and their packets are interleaved, you have effective parallelism. This is unlikely though as you explicitly prohibited it with your global mutex!
What is the best way to check the throughput in wireshark?
Don't. Use wireshark for inspecting packets, use the server to determine throughput. That's where the results actually matter.
3.Is the concept of parallel TCP suppose to increase the throughput?
Why did you implement all this complexity if you don't know what it's for?
There's a good chance a single thread (correctly coded with no spurious mutex thrashing) can saturate your network, so: no. Having multiple I/O threads is generally about conveniently partitioning your logic and state (ie, having one client per thread, or different unrelated I/O subsystems in different threads), rather than performance.
If you want to pull packets off a message queue and send them to TCP, the performant way is to:
use a single thread just doing this (your program may have other threads doing other things - avoid synchronizing with them if possible)
open a single persistent TCP connection to the server and don't connect/close it for every fragment
That's it. It's much simpler than what you have and will perform much better; a rough sketch follows below.
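Here is that sketch, reusing QUEUE_NAME and the hard-coded server address/port from your code; the connection is made once, and most error handling is omitted for brevity:
/* Sketch: one thread, one persistent connection. Reuses QUEUE_NAME and the
   server address/port from the question; error handling mostly omitted. */
void *forwarderThread(void *arg)
{
    (void)arg;
    mqd_t qd = mq_open(QUEUE_NAME, O_RDONLY);
    struct mq_attr attr;
    mq_getattr(qd, &attr);
    uint8_t *buf = malloc(attr.mq_msgsize);
    /* connect once, not once per fragment */
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in srv = { .sin_family = AF_INET, .sin_port = htons(8081) };
    inet_pton(AF_INET, "192.168.14.149", &srv.sin_addr);
    connect(fd, (struct sockaddr *)&srv, sizeof(srv));
    for (;;) {
        ssize_t n = mq_receive(qd, (char *)buf, attr.mq_msgsize, NULL);
        if (n < 0) break;                    /* queue closed or error */
        for (ssize_t off = 0; off < n; ) {   /* handle partial sends */
            ssize_t s = send(fd, buf + off, n - off, MSG_NOSIGNAL);
            if (s <= 0) goto done;
            off += s;
        }
    }
done:
    close(fd);
    mq_close(qd);
    free(buf);
    return NULL;
}
Note the inner loop: send() on a stream socket may transmit fewer bytes than requested, so partial sends have to be retried.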
You can realistically have one thread handling multiple different connections, but I can't see any way this would be useful in your case, so keep it simple.
Here is a partial answer to:
3. Is the concept of parallel TCP supposed to increase the throughput?
Kinda. It really depends on what the bottleneck is.
The first possible bottleneck is congestion control. A TCP sender has a limit on how many packets can be sent at once (before an ACK for the first of the bunch is received), called the congestion window. This number starts small and grows over time. Also, if a packet is lost, this number is cut in half and then grows slowly back until the next drop occurs. The limit, however, is per TCP connection, so if you spread your data over several parallel connections, the overall congestion window (the sum of the windows of all flows) will grow faster and drop by a smaller amount. (This is a summary; for details you need to know how congestion control works, and it is a big topic.) This happens irrespective of whether you are using threads or not. You can open several connections in one thread and achieve the same effect.
The second possible bottleneck is network processing in the OS. AFAIK this becomes an issue starting with 10 Gb links; maybe at 1 Gb, but probably not. TCP processing happens in the OS, not in your application. You may achieve better performance if the OS spreads that processing across processors (there should be parameters to enable this), and maybe a little better performance because of caches.
If you are reading files from disk, your disk I/O can also very well be the bottleneck. In that case I don't think spreading the sending across different threads is actually going to help.
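Regarding question 3 (several sockets from one thread): nothing ties a connection to a thread. A single thread can open several sockets and spread its fragments across them. A minimal sketch, with the server address/port taken from the question; frags, lens and nfrags are hypothetical placeholders for however the fragments are produced:
/* Sketch: open several TCP connections from a single thread and spread
   fragments across them round-robin. Error checks trimmed. */
static void send_over_n_connections(uint8_t **frags, size_t *lens, int nfrags)
{
    enum { NCONN = 4 };
    int conns[NCONN];
    struct sockaddr_in srv = { .sin_family = AF_INET, .sin_port = htons(8081) };
    inet_pton(AF_INET, "192.168.14.149", &srv.sin_addr);
    for (int i = 0; i < NCONN; i++) {
        conns[i] = socket(AF_INET, SOCK_STREAM, 0);
        connect(conns[i], (struct sockaddr *)&srv, sizeof(srv));  /* check errors in real code */
    }
    for (int f = 0; f < nfrags; f++)
        send(conns[f % NCONN], frags[f], lens[f], MSG_NOSIGNAL);  /* partial sends ignored here */
    for (int i = 0; i < NCONN; i++)
        close(conns[i]);
}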
I am writing a little web server which involves epoll and multithreading. For small and short HTTP/1.1 requests and responses, it works as expected. But when working with large file downloads, it is always interrupted by the timer I devised. I expire the timers with a fixed timeout value, but I also have an if statement to check whether the response was sent successfully.
static void
_expire_timers(list_t *timers, long timeout)
{
httpconn_t *conn;
int sockfd;
node_t *timer;
long cur_time;
long stamp;
timer = list_first(timers);
if (timer) {
cur_time = mstime();
do {
stamp = list_node_stamp(timer);
conn = (httpconn_t *)list_node_data(timer);
if ((cur_time - stamp >= timeout) && httpconn_close(conn)) {
sockfd = httpconn_sockfd(conn);
DEBSI("[CONN] socket closed, server disconnected", sockfd);
close(sockfd);
list_del(timers, stamp);
}
timer = list_next(timers);
} while (timer);
}
}
I realized that in a non-blocking environment, the write() function might be interrupted during the request-response communication. I wonder how long write() can block or how much data write() can send in one call, so I can tweak the timeout setting in my code.
This is the code which involves write(),
void
http_rep_get(int clifd, void *cache, char *path, void *req)
{
httpmsg_t *rep;
int len_msg;
char *bytes;
rep = _get_rep_msg((list_t *)cache, path, req);
bytes = msg_create_rep(rep, &len_msg);
/* send msg */
DEBSI("[REP] Sending reply msg...", clifd);
write(clifd, bytes, len_msg);
/* send body */
DEBSI("[REP] Sending body...", clifd);
write(clifd, msg_body_start(rep), msg_body_len(rep));
free(bytes);
msg_destroy(rep, 0);
}
And the following is the epoll loop I use to process the incoming requests,
do {
nevents = epoll_wait(epfd, events, MAXEVENTS, HTTP_KEEPALIVE_TIME);
if (nevents == -1) perror("epoll_wait()");
/* expire the timers */
_expire_timers(timers, HTTP_KEEPALIVE_TIME);
/* loop through events */
for (i = 0; i < nevents; i++) {
conn = (httpconn_t *)events[i].data.ptr;
sockfd = httpconn_sockfd(conn);
/* error case */
if ((events[i].events & EPOLLERR) || (events[i].events & EPOLLHUP) ||
(!(events[i].events & EPOLLIN))) {
perror("EPOLL ERR|HUP");
list_update(timers, conn, mstime());
break;
}
else if (sockfd == srvfd) {
_receive_conn(srvfd, epfd, cache, timers);
}
else {
/* client socket; read client data and process it */
thpool_add_task(taskpool, httpconn_task, conn);
}
}
} while (svc_running);
http_rep_get() is executed by the thread-pool handler httpconn_task(), and HTTP_KEEPALIVE_TIME is the fixed timeout. The handler httpconn_task() adds a timer to timers once a request arrives. Since the write() calls happen in http_rep_get(), I think they might be interrupted by the timers. I guess I can change the way I write to the clients, but I need to know how much write() can do.
If you are interested, you may browse my project to help me with this.
https://github.com/grassroot72/Maestro
Cheers,
Edward
Is there a size limit of write() for a socket fd?
It depends on what you mean by a limit.
As the comments explain, a write call may write fewer bytes than you ask it to. Furthermore, this is expected behavior if you perform a large write to a socket. However, there is no reliable way to determine (or predict) how many bytes will be written before you call write.
The correct way to deal with this is to check how many bytes were actually written each time, and use a loop to ensure that all bytes are written (or stop when you get a failure), as in the sketch below.
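A minimal sketch of such a loop, assuming a blocking socket (on a non-blocking socket you would additionally handle EAGAIN/EWOULDBLOCK and wait for EPOLLOUT before retrying):
#include <errno.h>
#include <unistd.h>
/* Write all len bytes, retrying on partial writes and EINTR.
   Returns 0 on success, -1 on error. */
static int write_all(int fd, const char *buf, size_t len)
{
  size_t off = 0;
  while (off < len) {
    ssize_t n = write(fd, buf + off, len - off);
    if (n > 0)
      off += (size_t)n;
    else if (n < 0 && errno == EINTR)
      continue;             /* interrupted by a signal: just retry */
    else
      return -1;            /* real error */
  }
  return 0;
}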
I have an application that reads large files from a server and hangs frequently on a particular machine. It has worked successfully under RHEL5.2 for a long time. We have recently upgraded to RHEL6.1 and it now hangs regularly.
I have created a test app that reproduces the problem. It hangs approx 98 times out of 100.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <sys/time.h>
int mFD = 0;
void open_socket()
{
struct addrinfo hints, *res;
memset(&hints, 0, sizeof(hints));
hints.ai_socktype = SOCK_STREAM;
hints.ai_family = AF_INET;
if (getaddrinfo("localhost", "60000", &hints, &res) != 0)
{
fprintf(stderr, "Exit %d\n", __LINE__);
exit(1);
}
mFD = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
if (mFD == -1)
{
fprintf(stderr, "Exit %d\n", __LINE__);
exit(1);
}
if (connect(mFD, res->ai_addr, res->ai_addrlen) < 0)
{
fprintf(stderr, "Exit %d\n", __LINE__);
exit(1);
}
freeaddrinfo(res);
}
void read_message(int size, void* data)
{
int bytesLeft = size;
int numRd = 0;
while (bytesLeft != 0)
{
fprintf(stderr, "reading %d bytes\n", bytesLeft);
/* Replacing MSG_WAITALL with 0 works fine */
int num = recv(mFD, data, bytesLeft, MSG_WAITALL);
if (num == 0)
{
break;
}
else if (num < 0 && errno != EINTR)
{
fprintf(stderr, "Exit %d\n", __LINE__);
exit(1);
}
else if (num > 0)
{
numRd += num;
data += num;
bytesLeft -= num;
fprintf(stderr, "read %d bytes - remaining = %d\n", num, bytesLeft);
}
}
fprintf(stderr, "read total of %d bytes\n", numRd);
}
int main(int argc, char **argv)
{
open_socket();
uint32_t raw_len = atoi(argv[1]);
char raw[raw_len];
read_message(raw_len, raw);
return 0;
}
Some notes from my testing:
If "localhost" maps to the loopback address 127.0.0.1, the app hangs on the call to recv() and NEVER returns.
If "localhost" maps to the ip of the machine, thus routing the packets via the ethernet interface, the app completes successfully.
When I experience a hang, the server sends a "TCP Window Full" message, and the client responds with a "TCP ZeroWindow" message (see image and attached tcpdump capture). From this point, it hangs forever with the server sending keep-alives and the client sending ZeroWindow messages. The client never seems to expand its window, allowing the transfer to complete.
During the hang, if I examine the output of "netstat -a", there is data in the server's send queue but the client's receive queue is empty.
If I remove the MSG_WAITALL flag from the recv() call, the app completes successfully.
The hanging issue only arises using the loopback interface on 1 particular machine. I suspect this may all be related to timing dependencies.
As I drop the size of the 'file', the likelihood of the hang occurring is reduced
The source for the test app can be found here:
Socket test source
The tcpdump capture from the loopback interface can be found here:
tcpdump capture
I reproduce the issue by issuing the following commands:
> gcc socket_test.c -o socket_test
> perl -e 'for (1..6000000){ print "a" }' | nc -l 60000
> ./socket_test 6000000
This sees 6000000 bytes sent to the test app which tries to read the data using a single call to recv().
I would love to hear any suggestions on what I might be doing wrong or any further ways to debug the issue.
MSG_WAITALL should block until all data has been received. From the manual page on recv:
This flag requests that the operation block until the full request is satisfied.
However, the buffers in the network stack probably are not large enough to contain everything, which is the reason for the error messages on the server. The client network stack simply can't hold that much data.
The solution is either to increase the buffer sizes (the SO_RCVBUF option to setsockopt), split the message into smaller pieces, or receive smaller chunks and put them into your own buffer. The last is what I would recommend.
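For the first option, a sketch of what enlarging the receive buffer could look like in your open_socket(), before the connect() call (the 4 MB figure is arbitrary; on Linux the kernel doubles the requested value and caps it according to net.core.rmem_max):
int rcvbuf = 4 * 1024 * 1024;  /* arbitrary; should cover the largest expected burst */
if (setsockopt(mFD, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) < 0)
  perror("setsockopt(SO_RCVBUF)");
socklen_t optlen = sizeof(rcvbuf);
getsockopt(mFD, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &optlen);
fprintf(stderr, "effective receive buffer: %d bytes\n", rcvbuf);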
Edit: I see in your code that you already do what I suggested (read smaller chunks with your own buffering), so just remove the MSG_WAITALL flag and it should work.
Oh, and when recv returns zero, that means the other end has closed the connection, and that you should do so too.
Consider these two possible rules:
The receiver may wait for the sender to send more before receiving what has already been sent.
The sender may wait for the receiver to receive what has already been sent before sending more.
We can have either of these rules, but we cannot have both of these rules.
Why? Because if the receiver is permitted to wait for the sender, that means the sender cannot wait for the receiver to receive before sending more, otherwise we deadlock. And if the sender is permitted to wait for the receiver, that means the receiver cannot wait for the sender to send before receiving more, otherwise we deadlock.
If both of these things happen at the same time, we deadlock. The sender will not send more until the receiver receives what has already been sent, and the receiver will not receive what has already been sent unless the sender sends more. Boom.
TCP chooses rule 2 (for reasons that should be obvious). Thus it cannot support rule 1. But in your code, you are the receiver, and you are waiting for the sender to send more before you receive what has already been sent. So this will deadlock.
send()'s man page mentions the MSG_MORE flag, which is said to act like TCP_CORK. I have a wrapper function around send():
int SocketConnection_Write(SocketConnection *this, void *buf, int len) {
errno = 0;
int sent = send(this->fd, buf, len, MSG_NOSIGNAL);
if (errno == EPIPE || errno == ENOTCONN) {
throw(exc, &SocketConnection_NotConnectedException);
} else if (errno == ECONNRESET) {
throw(exc, &SocketConnection_ConnectionResetException);
} else if (sent != len) {
throw(exc, &SocketConnection_LengthMismatchException);
}
return sent;
}
Assuming I want to use the kernel buffer, I could go with TCP_CORK: enable it whenever necessary and then disable it to flush the buffer. On the other hand, that requires an additional system call, so using MSG_MORE seems more appropriate to me. I'd simply change the above send() line to:
int sent = send(this->fd, buf, len, MSG_NOSIGNAL | MSG_MORE);
According to lwn.net, packets will be flushed automatically if they are large enough:
If an application sets that option on
a socket, the kernel will not send out
short packets. Instead, it will wait
until enough data has shown up to fill
a maximum-size packet, then send it.
When TCP_CORK is turned off, any
remaining data will go out on the
wire.
But this section only refers to TCP_CORK. Now, what is the proper way to flush MSG_MORE packets?
I can only think of two possibilities:
Call send() with an empty buffer and without MSG_MORE being set
Re-apply the TCP_CORK option as described on this page
Unfortunately the whole topic is very poorly documented and I couldn't find much on the Internet.
I am also wondering how to check that everything works as expected. Obviously, running the server through strace is not an option. So would the simplest way be to use netcat and then look at its strace output? Or will the kernel handle traffic transmitted over a loopback interface differently?
I have taken a look at the kernel source and both assumptions seem to be true. The following code are extracts from net/ipv4/tcp.c (2.6.33.1).
static inline void tcp_push(struct sock *sk, int flags, int mss_now,
int nonagle)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_send_head(sk)) {
struct sk_buff *skb = tcp_write_queue_tail(sk);
if (!(flags & MSG_MORE) || forced_push(tp))
tcp_mark_push(tp, skb);
tcp_mark_urg(tp, flags, skb);
__tcp_push_pending_frames(sk, mss_now,
(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
}
}
Hence, if the flag is not set, the pending frames will definitely be flushed. But this is only the case when the buffer is not empty:
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
size_t psize, int flags)
{
(...)
ssize_t copied;
(...)
copied = 0;
while (psize > 0) {
(...)
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
} else if (skb == tcp_send_head(sk))
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
out:
if (copied)
tcp_push(sk, flags, mss_now, tp->nonagle);
return copied;
do_error:
if (copied)
goto out;
out_err:
return sk_stream_error(sk, flags, err);
}
The while loop's body will never be executed because psize is not greater than 0. Then, in the out section, there is another chance: tcp_push() gets called, but because copied still has its default value, it will fail as well.
So sending a packet with the length 0 will never result in a flush.
The next theory was to re-apply TCP_CORK. Let's take a look at the code first:
static int do_tcp_setsockopt(struct sock *sk, int level,
int optname, char __user *optval, unsigned int optlen)
{
(...)
switch (optname) {
(...)
case TCP_NODELAY:
if (val) {
/* TCP_NODELAY is weaker than TCP_CORK, so that
* this option on corked socket is remembered, but
* it is not activated until cork is cleared.
*
* However, when TCP_NODELAY is set we make
* an explicit push, which overrides even TCP_CORK
* for currently queued segments.
*/
tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
tcp_push_pending_frames(sk);
} else {
tp->nonagle &= ~TCP_NAGLE_OFF;
}
break;
case TCP_CORK:
/* When set indicates to always queue non-full frames.
* Later the user clears this option and we transmit
* any pending partial frames in the queue. This is
* meant to be used alongside sendfile() to get properly
* filled frames when the user (for example) must write
* out headers with a write() call first and then use
* sendfile to send out the data parts.
*
* TCP_CORK can be set together with TCP_NODELAY and it is
* stronger than TCP_NODELAY.
*/
if (val) {
tp->nonagle |= TCP_NAGLE_CORK;
} else {
tp->nonagle &= ~TCP_NAGLE_CORK;
if (tp->nonagle&TCP_NAGLE_OFF)
tp->nonagle |= TCP_NAGLE_PUSH;
tcp_push_pending_frames(sk);
}
break;
(...)
As you can see, there are two ways to flush: you can either set TCP_NODELAY to 1 or set TCP_CORK to 0. Luckily, neither checks whether the flag is already set. Thus, my initial plan to re-apply the TCP_CORK flag can be simplified to just disabling it, even if it is not currently set.
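In code, flushing whatever MSG_MORE has left queued then boils down to one setsockopt() call. A sketch against the SocketConnection wrapper from the question, reusing the this->fd field from the snippet above:
#include <netinet/tcp.h>
/* Flush segments still held back by MSG_MORE: clearing TCP_CORK (or setting
   TCP_NODELAY) pushes pending partial frames, even if the option was never set. */
void SocketConnection_Flush(SocketConnection *this) {
  int off = 0;
  if (setsockopt(this->fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off)) < 0)
    perror("setsockopt(TCP_CORK)");
}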
I hope this helps someone with similar issues.
That's a lot of research... all I can offer is this empirical postscript:
Send a bunch of packets with MSG_MORE set, followed by a packet without MSG_MORE, and the whole lot goes out. It works a treat for something like this:
for (i=0; i<mg_live.length; i++) {
// [...]
if ((n = pth_send(sock, query, len, MSG_MORE | MSG_NOSIGNAL)) < len) {
printf("error writing to socket (sent %i bytes of %i)\n", n, len);
exit(1);
}
}
}
pth_send(sock, "END\n", 4, MSG_NOSIGNAL);
That is, when you're sending out all the packets at once, and have a clearly defined end... AND you are only using one socket.
If you tried writing to another socket in the middle of the above loop, you may find that Linux releases the previously held packets. At least that appears to be the trouble I'm having right now. But it might be an easy solution for you.
I have written an HTTP proxy that does some stuff that's not relevant here, but it is increasing the client's time-to-serve by a huge amount (600us without proxy vs 60000us with it). I think I have found where the bulk of that time is coming from - between my proxy finishing sending back to the client and the client finishing receiving it. For now, server, proxy and client are running on the same host, using localhost as the addresses.
Once the proxy has finished sending (once it has returned from send() at least), I print the result of gettimeofday which gives an absolute time. When my client has received, it prints the result of gettimeofday. Since they're both on the same host, this should be accurate. All send() calls are with no flags, so they are blocking. The difference between the two is about 40000us.
The proxy's socket on which it listens for client connections is set up with the hints AF_UNSPEC, SOCK_STREAM and AI_PASSIVE. Presumably a socket from accept()ing on that will have the same parameters?
If I'm understanding all this correctly, Apache manages to do everything in 600us (including the equivalent of whatever is causing this 40000us delay). Can anybody suggest what might be causing this? I have tried setting the TCP_NODELAY option (I know I shouldn't, it's just to see if it made a difference) and the delay between finishing sending and finishing receiving went right down; I forget the number, but it was under 1000us.
This is all on Ubuntu Linux 2.6.31-19. Thanks for any help.
40ms is the TCP ACK delay on Linux, which indicates that you are likely encountering a bad interaction between delayed acks and the Nagle algorithm. The best way to address this is to send all of your data using a single call to send() or sendmsg(), before waiting for a response. If that is not possible then certain TCP socket options including TCP_QUICKACK (on the receiving side), TCP_CORK (sending side), and TCP_NODELAY (sending side) can help, but can also hurt if used improperly. TCP_NODELAY simply disables the Nagle algorithm and is a one-time setting on the socket, whereas the other two must be set at the appropriate times during the life of the connection and can therefore be trickier to use.
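When the response is built as a header plus a body (two buffers, as in many proxies), the single-call approach can be as simple as gathering both into one writev()/sendmsg(). A rough sketch, with illustrative names:
#include <sys/uio.h>
/* Send header and body in one syscall so Nagle + delayed ACK never see a
   lone small segment. A short write is still possible; a full version
   would loop over the iovecs until everything has been sent. */
static ssize_t send_response(int fd, const char *hdr, size_t hdr_len,
                             const char *body, size_t body_len)
{
  struct iovec iov[2] = {
    { .iov_base = (void *)hdr,  .iov_len = hdr_len  },
    { .iov_base = (void *)body, .iov_len = body_len },
  };
  return writev(fd, iov, 2);
}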
You can't really do meaningful performance measurements on a proxy with the client, proxy and origin server on the same host.
Place them all on different hosts on a network. Use real hardware machines for them all, or specialised hardware test systems (e.g. Spirent).
Your methodology makes no sense. Nobody has 600us of latency to their origin server in practice anyway. Running all the tasks on the same host creates contention and a wholly unrealistic network environment.
INTRODUCTION:
I already praised mark4o for the truly correct answer to the general question of lowering latency. I would like to translate the answer in terms of how it helped solve my latency issue because I think it's going to be the answer most people come here looking for.
ANSWER:
In a real-time network app (such as a multiplayer game) where getting short messages between nodes as quickly as possible is critical, TURN NAGLE OFF. In most cases this means setting the "no-delay" flag to true.
DISCLAIMER:
While this may not solve the OP specific problem, most people who come here will probably be looking for this answer to the general question of their latency issues.
ANECDOTAL BACK-STORY:
My game was doing fine until I added code to send two messages separately, but they were very close to each other in execution time. Suddenly, I was getting 250ms extra latency. As this was a part of a larger code change, I spent two days trying to figure out what my problem was. When I combined the two messages into one, the problem went away. Logic led me to mark4o's post and so I set the .Net socket member "NoDelay" to true, and I can send as many messages in a row as I want.
From, e.g., the Red Hat documentation:
Applications that require lower latency on every packet sent should be run on sockets with TCP_NODELAY enabled. It can be enabled through the setsockopt command with the sockets API:
int one = 1;
setsockopt(descriptor, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
For this to be used effectively, applications must avoid doing small, logically related buffer writes. Because TCP_NODELAY is enabled, these small writes will make TCP send these multiple buffers as individual packets, which can result in poor overall performance.
In your case, that 40ms is probably just a scheduler time quantum. In other words, that's how long it takes your system to get back round to the other tasks. Try it on a real network and you'll get a completely different picture. If you have a multi-core machine, using virtual OS instances in VirtualBox or some other VM would give you a much better idea of what is really going to happen.
For a TCP proxy it would seem prudent on the LAN side to increase the TCP initial window size as discussed on linux-netdev and /. recently.
http://www.amailbox.org/mailarchive/linux-netdev/2010/5/26/6278007
http://developers.slashdot.org/story/10/11/26/1729218/Google-Microsoft-Cheat-On-Slow-Start-mdash-Should-You
Including paper on the topic by Google,
http://www.google.com/research/pubs/pub36640.html
And an IETF draft also by Google,
http://zinfandel.levkowetz.com/html/draft-ietf-tcpm-initcwnd-00
For Windows, I'm not sure if setting TCP_NODELAY helps. I tried that, but latency was still bad. One person suggested I try UDP, and that did the trick.
A few complicated examples of UDP did not work for me, but I ran across a simple one and it did the trick...
#include <Winsock2.h>
#include <WS2tcpip.h>
#include <system_error>
#include <string>
#include <iostream>
class WSASession
{
public:
WSASession()
{
int ret = WSAStartup(MAKEWORD(2, 2), &data);
if (ret != 0)
throw std::system_error(WSAGetLastError(), std::system_category(), "WSAStartup Failed");
}
~WSASession()
{
WSACleanup();
}
private:
WSAData data;
};
class UDPSocket
{
public:
UDPSocket()
{
sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
if (sock == INVALID_SOCKET)
throw std::system_error(WSAGetLastError(), std::system_category(), "Error opening socket");
}
~UDPSocket()
{
closesocket(sock);
}
void SendTo(const std::string& address, unsigned short port, const char* buffer, int len, int flags = 0)
{
sockaddr_in add;
add.sin_family = AF_INET;
add.sin_addr.s_addr = inet_addr(address.c_str());
add.sin_port = htons(port);
int ret = sendto(sock, buffer, len, flags, reinterpret_cast<SOCKADDR *>(&add), sizeof(add));
if (ret < 0)
throw std::system_error(WSAGetLastError(), std::system_category(), "sendto failed");
}
void SendTo(sockaddr_in& address, const char* buffer, int len, int flags = 0)
{
int ret = sendto(sock, buffer, len, flags, reinterpret_cast<SOCKADDR *>(&address), sizeof(address));
if (ret < 0)
throw std::system_error(WSAGetLastError(), std::system_category(), "sendto failed");
}
sockaddr_in RecvFrom(char* buffer, int len, int flags = 0)
{
sockaddr_in from;
int size = sizeof(from);
int ret = recvfrom(sock, buffer, len, flags, reinterpret_cast<SOCKADDR *>(&from), &size);
if (ret < 0)
throw std::system_error(WSAGetLastError(), std::system_category(), "recvfrom failed");
// make the buffer zero terminated
buffer[ret] = 0;
return from;
}
void Bind(unsigned short port)
{
sockaddr_in add;
add.sin_family = AF_INET;
add.sin_addr.s_addr = htonl(INADDR_ANY);
add.sin_port = htons(port);
int ret = bind(sock, reinterpret_cast<SOCKADDR *>(&add), sizeof(add));
if (ret < 0)
throw std::system_error(WSAGetLastError(), std::system_category(), "Bind failed");
}
private:
SOCKET sock;
};
Server
#define TRANSACTION_SIZE 8
static void startService(int portNumber)
{
try
{
WSASession Session;
UDPSocket Socket;
char tmpBuffer[TRANSACTION_SIZE];
INPUT input;
input.type = INPUT_MOUSE;
input.mi.mouseData=0;
input.mi.dwFlags = MOUSEEVENTF_MOVE;
Socket.Bind(portNumber);
while (1)
{
sockaddr_in add = Socket.RecvFrom(tmpBuffer, sizeof(tmpBuffer));
...do something with tmpBuffer...
Socket.SendTo(add, data, len);
}
}
catch (std::system_error& e)
{
std::cout << e.what();
}
Client
char *targetIP = "192.168.1.xxx";
Socket.SendTo(targetIP, targetPort, data, len);