Project

General

Profile

Bug #1437 » 0001-Implement-autosizing-TCP-socket-buffers.patch

pavalos, 07/26/2009 12:16 AM

View differences:

sys/kern/uipc_socket.c
ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
ai.p_ucred = p->p_ucred;
ai.fd_rdir = p->p_fd->fd_rdir;
/*
* Auto-sizing of socket buffers is managed by the protocols and
* the appropriate flags must be set in the pru_attach function.
*/
error = so_pru_attach(so, proto, &ai);
if (error) {
so->so_state |= SS_NOFDREF;
......
error = ENOBUFS;
goto bad;
}
(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
&so->so_rcv)->ssb_flags &= ~SSB_AUTOSIZE;
break;
/*
sys/kern/uipc_socket2.c
so->so_linger = head->so_linger;
so->so_state = head->so_state | SS_NOFDREF;
so->so_proto = head->so_proto;
so->so_timeo = head->so_timeo;
so->so_cred = crhold(head->so_cred);
ai.sb_rlimit = NULL;
ai.p_ucred = NULL;
......
sodealloc(so);
return (NULL);
}
so->so_rcv.ssb_lowat = head->so_rcv.ssb_lowat;
so->so_snd.ssb_lowat = head->so_snd.ssb_lowat;
so->so_rcv.ssb_timeo = head->so_rcv.ssb_timeo;
so->so_snd.ssb_timeo = head->so_snd.ssb_timeo;
so->so_rcv.ssb_flags |= head->so_rcv.ssb_flags & SSB_AUTOSIZE;
so->so_snd.ssb_flags |= head->so_snd.ssb_flags & SSB_AUTOSIZE;
if (connstatus) {
TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
so->so_state |= SS_COMP;
sys/kern/uipc_usrreq.c
!(so->so_proto->pr_flags & PR_RIGHTS))
return(0);
#ifdef notdef
if (so->so_rcv.sb_flags & SB_LOCK) {
if (so->so_rcv.ssb_flags & SSB_LOCK) {
/*
* This is problematical; it's not clear
* we need to wait for the sockbuf to be
sys/netinet/tcp.h
#define TCP_MSS 1460
/*
* TCP_MINMSS is defined to be 256 which is fine for the smallest
* link MTU (296 bytes, SLIP interface) in the Internet.
* However it is very unlikely to come across such low MTU interfaces
* these days (anno dato 2003).
* Probably it can be set to 512 without ill effects. But we play safe.
* See tcp_subr.c tcp_minmss SYSCTL declaration for more comments.
* Setting this to "0" disables the minmss check.
*/
#define TCP_MINMSS 256
/*
* TCP_MINMSSOVERLOAD is defined to be 1000 which should cover any type
* of interactive TCP session.
* See tcp_subr.c tcp_minmssoverload SYSCTL declaration and tcp_input.c
* for more comments.
* Setting this to "0" disables the minmssoverload check.
*/
#define TCP_MINMSSOVERLOAD 1000
/*
* Default maximum segment size for TCP6.
* With an IP6 MSS of 1280, this is 1220,
* but 1024 is probably more convenient. (xxx kazu in doubt)
sys/netinet/tcp_input.c
&tcp_reass_overflows, 0,
"Global number of TCP Segment Reassembly Queue Overflows");
/*
 * Tunables for automatic receive socket buffer sizing, exported
 * read-write under the _net_inet_tcp sysctl node.
 */
/* Master switch; non-zero (the default) enables receive buffer autosizing. */
int tcp_do_autorcvbuf = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
&tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
/* Amount added to so_rcv.ssb_hiwat on each step up (default 16*1024). */
int tcp_autorcvbuf_inc = 16*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
&tcp_autorcvbuf_inc, 0,
"Incrementor step size of automatic receive buffer");
/* Upper bound the autosized receive buffer may grow to (default 16*1024*1024). */
int tcp_autorcvbuf_max = 16*1024*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
&tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
static void tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
......
KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
/*
* This is the second part of the MSS DoS prevention code (after
* minmss on the sending side) and it deals with too many too small
* tcp packets in a too short timeframe (1 second).
*
* For every full second we count the number of received packets
* and bytes. If we get a lot of packets per second for this connection
* (tcp_minmssoverload) we take a closer look at it and compute the
* average packet size for the past second. If that is less than
* tcp_minmss we get too many packets with very small payload which
* is not good and burdens our system (and every packet generates
* a wakeup to the process connected to our socket). We can reasonably
* expect this to be a small-packet DoS attack meant to exhaust our CPU
* cycles.
*
* Care has to be taken for the minimum packet overload value. This
* value defines the minimum number of packets per second before we
* start to worry. This must not be too low to avoid killing for
* example interactive connections with many small packets like
* telnet or SSH.
*
* Setting either tcp_minmssoverload or tcp_minmss to "0" disables
* this check.
*
* Account for packet if payload packet, skip over ACK, etc.
*/
if (tcp_minmss && tcp_minmssoverload &&
tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
if (tp->rcv_second > ticks) {
tp->rcv_pps++;
tp->rcv_byps += tlen + off;
if (tp->rcv_pps > tcp_minmssoverload) {
if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
kprintf("too many small tcp packets from "
"%s:%u, av. %lubyte/packet, "
"dropping connection\n",
#ifdef INET6
isipv6 ?
ip6_sprintf(&inp->inp_inc.inc6_faddr) :
#endif
inet_ntoa(inp->inp_inc.inc_faddr),
inp->inp_inc.inc_fport,
tp->rcv_byps / tp->rcv_pps);
tp = tcp_drop(tp, ECONNRESET);
tcpstat.tcps_minmssdrops++;
goto drop;
}
}
} else {
tp->rcv_second = ticks + hz;
tp->rcv_pps = 1;
tp->rcv_byps = tlen + off;
}
}
/*
* Segment received on connection.
* Reset idle time and keep-alive timer.
*/
......
th->th_ack == tp->snd_una &&
LIST_EMPTY(&tp->t_segq) &&
tlen <= ssb_space(&so->so_rcv)) {
int newsize = 0; /* automatic sockbuf scaling */
/*
* This is a pure, in-sequence data packet
* with nothing on the reassembly queue and
......
tcpstat.tcps_rcvpack++;
tcpstat.tcps_rcvbyte += tlen;
ND6_HINT(tp); /* some progress has been done */
/*
* Automatic sizing of receive socket buffer. Often the receive
* buffer size is not optimally adjusted to the actual network
* conditions at hand (delay bandwidth product). Setting the
* buffer size too small limits throughput on links with high
* bandwidth and high delay (eg. trans-continental/oceanic links).
*
* On the receive side the socket buffer memory is only rarely
* used to any significant extent. This allows us to be much
* more aggressive in scaling the receive socket buffer. For
* the case that the buffer space is actually used to a large
* extent and we run out of kernel memory we can simply drop
* the new segments; TCP on the sender will just retransmit it
* later. Setting the buffer size too big may only consume too
* much kernel memory if the application doesn't read() from
* the socket or packet loss or reordering makes use of the
* reassembly queue.
*
* The criteria to step up the receive buffer one notch are:
* 1. the number of bytes received during the time it takes
* one timestamp to be reflected back to us (the RTT);
* 2. received bytes per RTT is within seven eighths of the
* current socket buffer size;
* 3. receive buffer size has not hit maximal automatic size;
*
* This algorithm does one step per RTT at most and only if
* we receive a bulk stream w/o packet losses or reorderings.
* Shrinking the buffer during idle times is not necessary as
* it doesn't consume any memory when idle.
*
* TODO: Only step up if the application is actually serving
* the buffer to better manage the socket buffer resources.
*/
if (tcp_do_autorcvbuf &&
to.to_tsecr &&
(so->so_rcv.ssb_flags & SSB_AUTOSIZE)) {
if (to.to_tsecr > tp->rfbuf_ts &&
to.to_tsecr - tp->rfbuf_ts < hz) {
if (tp->rfbuf_cnt >
(so->so_rcv.ssb_hiwat / 8 * 7) &&
so->so_rcv.ssb_hiwat <
tcp_autorcvbuf_max) {
newsize =
min(so->so_rcv.ssb_hiwat +
tcp_autorcvbuf_inc,
tcp_autorcvbuf_max);
}
/* Start over with next RTT. */
tp->rfbuf_ts = 0;
tp->rfbuf_cnt = 0;
} else
tp->rfbuf_cnt += tlen; /* add up */
}
/*
* Add data to socket buffer.
*/
if (so->so_state & SS_CANTRCVMORE) {
m_freem(m);
} else {
/*
* Set new socket buffer size.
* Give up when limit is reached.
*/
if (newsize)
if (!ssb_reserve(&so->so_rcv, newsize,
so, NULL))
so->so_rcv.ssb_flags &= ~SSB_AUTOSIZE;
m_adj(m, drop_hdrlen); /* delayed header drop */
ssb_appendstream(&so->so_rcv, m);
}
......
recvwin = 0;
tp->rcv_wnd = imax(recvwin, (int)(tp->rcv_adv - tp->rcv_nxt));
/* Reset receive buffer auto scaling when not in bulk receive mode. */
tp->rfbuf_ts = 0;
tp->rfbuf_cnt = 0;
switch (tp->t_state) {
/*
* If the state is SYN_RECEIVED:
......
* Offer == 0 means that there was no MSS on the SYN segment,
* in this case we use tcp_mssdflt.
*/
if (offer == 0)
if (offer == 0) {
offer = (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
else
} else {
/*
* Prevent DoS attack with too small MSS. Round up
* to at least minmss.
*/
offer = max(offer, tcp_minmss);
/*
* Sanity check: make sure that maxopd will be large
* enough to allow some data on segments even if the
......
* funny things may happen in tcp_output.
*/
offer = max(offer, 64);
}
taop->tao_mssopt = offer;
/*
sys/netinet/tcp_output.c
SYSCTL_INT(_net_inet_tcp, OID_AUTO, avoid_pure_win_update, CTLFLAG_RW,
&avoid_pure_win_update, 1, "Avoid pure window updates when possible");
/*
 * Tunables for automatic send socket buffer sizing, exported
 * read-write under the _net_inet_tcp sysctl node.
 */
/* Master switch; non-zero (the default) enables send buffer autosizing. */
int tcp_do_autosndbuf = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
&tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
/* Amount added to so_snd.ssb_hiwat on each step up (default 8*1024). */
int tcp_autosndbuf_inc = 8*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
&tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
/* Upper bound the autosized send buffer may grow to (default 16*1024*1024). */
int tcp_autosndbuf_max = 16*1024*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
/*
* Tcp output routine: figure out what should be sent and send it.
*/
......
}
}
KASSERT(len >= 0, ("%s: len < 0", __func__));
/*
* len will be >= 0 after this point. Truncate to the maximum
* segment length and ensure that FIN is removed if the length
* no longer contains the last data byte.
* Automatic sizing of send socket buffer. Often the send buffer
* size is not optimally adjusted to the actual network conditions
* at hand (delay bandwidth product). Setting the buffer size too
* small limits throughput on links with high bandwidth and high
* delay (eg. trans-continental/oceanic links). Setting the
* buffer size too big consumes too much real kernel memory,
* especially with many connections on busy servers.
*
* The criteria to step up the send buffer one notch are:
* 1. receive window of remote host is larger than send buffer
* (with a fudge factor of 5/4th);
* 2. send buffer is filled to 7/8th with data (so we actually
* have data to make use of it);
* 3. send buffer fill has not hit maximal automatic size;
* 4. our send window (slow start and congestion controlled) is
* larger than sent but unacknowledged data in send buffer.
*
* The remote host receive window scaling factor may limit the
* growing of the send buffer before it reaches its allowed
* maximum.
*
* It scales directly with slow start or congestion window
* and does at most one step per received ACK. This fast
* scaling has the drawback of growing the send buffer beyond
* what is strictly necessary to make full use of a given
* delay*bandwidth product. However testing has shown this not
* to be much of a problem. At worst we are trading wasting
* of available bandwidth (the non-use of it) for wasting some
* socket buffer memory.
*
* TODO: Shrink send buffer during idle periods together
* with congestion window. Requires another timer. Has to
* wait for upcoming tcp timer rewrite.
*/
if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
so->so_snd.ssb_cc < tcp_autosndbuf_max &&
sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
if (!ssb_reserve(&so->so_snd,
min(so->so_snd.ssb_hiwat + tcp_autosndbuf_inc,
tcp_autosndbuf_max), so, NULL))
so->so_snd.ssb_flags &= ~SSB_AUTOSIZE;
}
}
/*
* Truncate to the maximum segment length and ensure that FIN is
* removed if the length no longer contains the last data byte.
*/
if (len > tp->t_maxseg) {
len = tp->t_maxseg;
......
optlen += TCPOLEN_TSTAMP_APPA;
}
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
tp->rfbuf_ts = ticks;
/*
* Send `CC-family' options if our side wants to use them (TF_REQ_CC),
* options are allowed (!TF_NOOPT) and it's not a RST.
sys/netinet/tcp_subr.c
&tcp_v6mssdflt, 0, "Default TCP Maximum Segment Size for IPv6");
#endif
/*
* Minimum MSS we accept and use. This prevents DoS attacks where
* we are forced to a ridiculously low MSS like 20 and send hundreds
* of packets instead of one. The effect scales with the available
* bandwidth and quickly saturates the CPU and network interface
* with packet generation and sending. Set to zero to disable MINMSS
* checking. This setting prevents us from sending too small packets.
*
* Defaults to TCP_MINMSS and is exported read-write as the "minmss"
* entry under the _net_inet_tcp sysctl node.
*/
int tcp_minmss = TCP_MINMSS;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
&tcp_minmss, 0, "Minimum TCP Maximum Segment Size");
/*
* Number of TCP segments per second we accept from remote host
* before we start to calculate average segment size. If average
* segment size drops below the minimum TCP MSS we assume a DoS
* attack and reset+drop the connection. Care has to be taken not to
* set this value too small to not kill interactive type connections
* (telnet, SSH) which send many small packets.
*
* Defaults to TCP_MINMSSOVERLOAD and is exported read-write as the
* "minmssoverload" entry under the _net_inet_tcp sysctl node.
*/
int tcp_minmssoverload = TCP_MINMSSOVERLOAD;
/*
* The description is built from two adjacent string literals; the
* trailing space after "to" keeps the words from running together.
*/
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW,
&tcp_minmssoverload, 0, "Number of TCP Segments per Second allowed to "
"be under the MINMSS Size");
#if 0
static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
sys/netinet/tcp_usrreq.c
break;
case TCP_MAXSEG:
if (optval > 0 && optval <= tp->t_maxseg)
if (optval > 0 && optval <= tp->t_maxseg &&
optval + 40 >= tcp_minmss)
tp->t_maxseg = optval;
else
error = EINVAL;
......
if (error)
return (error);
}
so->so_rcv.ssb_flags |= SSB_AUTOSIZE;
so->so_snd.ssb_flags |= SSB_AUTOSIZE;
cpu = mycpu->gd_cpuid;
error = in_pcballoc(so, &tcbinfo[cpu]);
if (error)
sys/netinet/tcp_var.h
u_long snd_bwnd; /* bandwidth-controlled window */
int t_bw_rtttime; /* used for bandwidth calculation */
tcp_seq t_bw_rtseq; /* used for bandwidth calculation */
/* anti DoS counters */
u_long rcv_second; /* start of interval second */
u_long rcv_pps; /* received packets per second */
u_long rcv_byps; /* received bytes per second */
u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
int rfbuf_cnt; /* recv buffer autoscaling byte count */
};
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
......
u_long tcps_connects; /* connections established */
u_long tcps_drops; /* connections dropped */
u_long tcps_conndrops; /* embryonic connections dropped */
u_long tcps_minmssdrops; /* average minmss too low drops */
u_long tcps_closed; /* conn. closed (includes drops) */
u_long tcps_segstimed; /* segs where we tried to get rtt */
u_long tcps_rttupdated; /* times we succeeded */
......
extern struct tcpcbackqhead tcpcbackq[];
extern int tcp_mssdflt; /* XXX */
extern int tcp_minmss;
extern int tcp_minmssoverload;
extern int tcp_delack_enabled;
extern int path_mtu_discovery;
sys/sys/socketvar.h
#define SSB_KNOTE 0x100 /* kernel note attached */
#define SSB_MEVENT 0x200 /* need message event notification */
#define SSB_STOP 0x400 /* backpressure indicator */
#define SSB_AUTOSIZE 0x800 /* automatically size socket buffer */
/*
* Per-socket kernel structure. Contains universal send and receive queues,
    (1-1/1)