Bug #1437 » 0001-Implement-autosizing-TCP-socket-buffers.patch
sys/kern/uipc_socket.c | ||
---|---|---|
ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
|
||
ai.p_ucred = p->p_ucred;
|
||
ai.fd_rdir = p->p_fd->fd_rdir;
|
||
/*
|
||
* Auto-sizing of socket buffers is managed by the protocols and
|
||
* the appropriate flags must be set in the pru_attach function.
|
||
*/
|
||
error = so_pru_attach(so, proto, &ai);
|
||
if (error) {
|
||
so->so_state |= SS_NOFDREF;
|
||
... | ... | |
error = ENOBUFS;
|
||
goto bad;
|
||
}
|
||
(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
|
||
&so->so_rcv)->ssb_flags &= ~SSB_AUTOSIZE;
|
||
break;
|
||
/*
|
sys/kern/uipc_socket2.c | ||
---|---|---|
so->so_linger = head->so_linger;
|
||
so->so_state = head->so_state | SS_NOFDREF;
|
||
so->so_proto = head->so_proto;
|
||
so->so_timeo = head->so_timeo;
|
||
so->so_cred = crhold(head->so_cred);
|
||
ai.sb_rlimit = NULL;
|
||
ai.p_ucred = NULL;
|
||
... | ... | |
sodealloc(so);
|
||
return (NULL);
|
||
}
|
||
so->so_rcv.ssb_lowat = head->so_rcv.ssb_lowat;
|
||
so->so_snd.ssb_lowat = head->so_snd.ssb_lowat;
|
||
so->so_rcv.ssb_timeo = head->so_rcv.ssb_timeo;
|
||
so->so_snd.ssb_timeo = head->so_snd.ssb_timeo;
|
||
so->so_rcv.ssb_flags |= head->so_rcv.ssb_flags & SSB_AUTOSIZE;
|
||
so->so_snd.ssb_flags |= head->so_snd.ssb_flags & SSB_AUTOSIZE;
|
||
if (connstatus) {
|
||
TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
|
||
so->so_state |= SS_COMP;
|
sys/kern/uipc_usrreq.c | ||
---|---|---|
!(so->so_proto->pr_flags & PR_RIGHTS))
|
||
return(0);
|
||
#ifdef notdef
|
||
if (so->so_rcv.sb_flags & SB_LOCK) {
|
||
if (so->so_rcv.ssb_flags & SSB_LOCK) {
|
||
/*
|
||
* This is problematical; it's not clear
|
||
* we need to wait for the sockbuf to be
|
sys/netinet/tcp.h | ||
---|---|---|
#define TCP_MSS 1460
|
||
/*
|
||
* TCP_MINMSS is defined to be 256 which is fine for the smallest
|
||
* link MTU (296 bytes, SLIP interface) in the Internet.
|
||
* However it is very unlikely to come across such low MTU interfaces
|
||
* these days (anno dato 2003).
|
||
* Probably it can be set to 512 without ill effects. But we play safe.
|
||
* See tcp_subr.c tcp_minmss SYSCTL declaration for more comments.
|
||
* Setting this to "0" disables the minmss check.
|
||
*/
|
||
#define TCP_MINMSS 256
|
||
/*
|
||
* TCP_MINMSSOVERLOAD is defined to be 1000 which should cover any type
|
||
* of interactive TCP session.
|
||
* See tcp_subr.c tcp_minmssoverload SYSCTL declaration and tcp_input.c
|
||
* for more comments.
|
||
* Setting this to "0" disables the minmssoverload check.
|
||
*/
|
||
#define TCP_MINMSSOVERLOAD 1000
|
||
/*
|
||
* Default maximum segment size for TCP6.
|
||
* With an IP6 MSS of 1280, this is 1220,
|
||
* but 1024 is probably more convenient. (xxx kazu in doubt)
|
sys/netinet/tcp_input.c | ||
---|---|---|
&tcp_reass_overflows, 0,
|
||
"Global number of TCP Segment Reassembly Queue Overflows");
|
||
int tcp_do_autorcvbuf = 1;
|
||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
|
||
&tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
|
||
int tcp_autorcvbuf_inc = 16*1024;
|
||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
|
||
&tcp_autorcvbuf_inc, 0,
|
||
"Incrementor step size of automatic receive buffer");
|
||
int tcp_autorcvbuf_max = 16*1024*1024;
|
||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
|
||
&tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
|
||
static void tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
|
||
static void tcp_pulloutofband(struct socket *,
|
||
struct tcphdr *, struct mbuf *, int);
|
||
... | ... | |
KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
|
||
/*
|
||
* This is the second part of the MSS DoS prevention code (after
|
||
* minmss on the sending side) and it deals with too many too small
|
||
* tcp packets in a too short timeframe (1 second).
|
||
*
|
||
* For every full second we count the number of received packets
|
||
* and bytes. If we get a lot of packets per second for this connection
|
||
* (tcp_minmssoverload) we take a closer look at it and compute the
|
||
* average packet size for the past second. If that is less than
|
||
* tcp_minmss we get too many packets with very small payload which
|
||
* is not good and burdens our system (and every packet generates
|
||
* a wakeup to the process connected to our socket). We can reasonably
|
||
* expect this to be a small packet DoS attack to exhaust our CPU
|
||
* cycles.
|
||
*
|
||
* Care has to be taken for the minimum packet overload value. This
|
||
* value defines the minimum number of packets per second before we
|
||
* start to worry. This must not be too low to avoid killing for
|
||
* example interactive connections with many small packets like
|
||
* telnet or SSH.
|
||
*
|
||
* Setting either tcp_minmssoverload or tcp_minmss to "0" disables
|
||
* this check.
|
||
*
|
||
* Account for packet if payload packet, skip over ACK, etc.
|
||
*/
|
||
if (tcp_minmss && tcp_minmssoverload &&
|
||
tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
|
||
if (tp->rcv_second > ticks) {
|
||
tp->rcv_pps++;
|
||
tp->rcv_byps += tlen + off;
|
||
if (tp->rcv_pps > tcp_minmssoverload) {
|
||
if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
|
||
kprintf("too many small tcp packets from "
|
||
"%s:%u, av. %lubyte/packet, "
|
||
"dropping connection\n",
|
||
#ifdef INET6
|
||
isipv6 ?
|
||
ip6_sprintf(&inp->inp_inc.inc6_faddr) :
|
||
#endif
|
||
inet_ntoa(inp->inp_inc.inc_faddr),
|
||
inp->inp_inc.inc_fport,
|
||
tp->rcv_byps / tp->rcv_pps);
|
||
tp = tcp_drop(tp, ECONNRESET);
|
||
tcpstat.tcps_minmssdrops++;
|
||
goto drop;
|
||
}
|
||
}
|
||
} else {
|
||
tp->rcv_second = ticks + hz;
|
||
tp->rcv_pps = 1;
|
||
tp->rcv_byps = tlen + off;
|
||
}
|
||
}
|
||
/*
|
||
* Segment received on connection.
|
||
* Reset idle time and keep-alive timer.
|
||
*/
|
||
... | ... | |
th->th_ack == tp->snd_una &&
|
||
LIST_EMPTY(&tp->t_segq) &&
|
||
tlen <= ssb_space(&so->so_rcv)) {
|
||
int newsize = 0; /* automatic sockbuf scaling */
|
||
/*
|
||
* This is a pure, in-sequence data packet
|
||
* with nothing on the reassembly queue and
|
||
... | ... | |
tcpstat.tcps_rcvpack++;
|
||
tcpstat.tcps_rcvbyte += tlen;
|
||
ND6_HINT(tp); /* some progress has been done */
|
||
/*
|
||
* Automatic sizing of receive socket buffer. Often the receive
|
||
* buffer size is not optimally adjusted to the actual network
|
||
* conditions at hand (delay bandwidth product). Setting the
|
||
* buffer size too small limits throughput on links with high
|
||
* bandwidth and high delay (eg. trans-continental/oceanic links).
|
||
*
|
||
* On the receive side the socket buffer memory is only rarely
|
||
* used to any significant extent. This allows us to be much
|
||
* more aggressive in scaling the receive socket buffer. For
|
||
* the case that the buffer space is actually used to a large
|
||
* extent and we run out of kernel memory we can simply drop
|
||
* the new segments; TCP on the sender will just retransmit it
|
||
* later. Setting the buffer size too big may only consume too
|
||
* much kernel memory if the application doesn't read() from
|
||
* the socket or packet loss or reordering makes use of the
|
||
* reassembly queue.
|
||
*
|
||
* The criteria to step up the receive buffer one notch are:
|
||
* 1. the number of bytes received during the time it takes
|
||
* one timestamp to be reflected back to us (the RTT);
|
||
* 2. received bytes per RTT is within seven eighths of the
|
||
* current socket buffer size;
|
||
* 3. receive buffer size has not hit maximal automatic size;
|
||
*
|
||
* This algorithm does one step per RTT at most and only if
|
||
* we receive a bulk stream w/o packet losses or reorderings.
|
||
* Shrinking the buffer during idle times is not necessary as
|
||
* it doesn't consume any memory when idle.
|
||
*
|
||
* TODO: Only step up if the application is actually serving
|
||
* the buffer to better manage the socket buffer resources.
|
||
*/
|
||
if (tcp_do_autorcvbuf &&
|
||
to.to_tsecr &&
|
||
(so->so_rcv.ssb_flags & SSB_AUTOSIZE)) {
|
||
if (to.to_tsecr > tp->rfbuf_ts &&
|
||
to.to_tsecr - tp->rfbuf_ts < hz) {
|
||
if (tp->rfbuf_cnt >
|
||
(so->so_rcv.ssb_hiwat / 8 * 7) &&
|
||
so->so_rcv.ssb_hiwat <
|
||
tcp_autorcvbuf_max) {
|
||
newsize =
|
||
min(so->so_rcv.ssb_hiwat +
|
||
tcp_autorcvbuf_inc,
|
||
tcp_autorcvbuf_max);
|
||
}
|
||
/* Start over with next RTT. */
|
||
tp->rfbuf_ts = 0;
|
||
tp->rfbuf_cnt = 0;
|
||
} else
|
||
tp->rfbuf_cnt += tlen; /* add up */
|
||
}
|
||
/*
|
||
* Add data to socket buffer.
|
||
*/
|
||
if (so->so_state & SS_CANTRCVMORE) {
|
||
m_freem(m);
|
||
} else {
|
||
/*
|
||
* Set new socket buffer size.
|
||
* Give up when limit is reached.
|
||
*/
|
||
if (newsize)
|
||
if (!ssb_reserve(&so->so_rcv, newsize,
|
||
so, NULL))
|
||
so->so_rcv.ssb_flags &= ~SSB_AUTOSIZE;
|
||
m_adj(m, drop_hdrlen); /* delayed header drop */
|
||
ssb_appendstream(&so->so_rcv, m);
|
||
}
|
||
... | ... | |
recvwin = 0;
|
||
tp->rcv_wnd = imax(recvwin, (int)(tp->rcv_adv - tp->rcv_nxt));
|
||
/* Reset receive buffer auto scaling when not in bulk receive mode. */
|
||
tp->rfbuf_ts = 0;
|
||
tp->rfbuf_cnt = 0;
|
||
switch (tp->t_state) {
|
||
/*
|
||
* If the state is SYN_RECEIVED:
|
||
... | ... | |
* Offer == 0 means that there was no MSS on the SYN segment,
|
||
* in this case we use tcp_mssdflt.
|
||
*/
|
||
if (offer == 0)
|
||
if (offer == 0) {
|
||
offer = (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
|
||
else
|
||
} else {
|
||
/*
|
||
* Prevent DoS attack with too small MSS. Round up
|
||
* to at least minmss.
|
||
*/
|
||
offer = max(offer, tcp_minmss);
|
||
/*
|
||
* Sanity check: make sure that maxopd will be large
|
||
* enough to allow some data on segments even if the
|
||
... | ... | |
* funny things may happen in tcp_output.
|
||
*/
|
||
offer = max(offer, 64);
|
||
}
|
||
taop->tao_mssopt = offer;
|
||
/*
|
sys/netinet/tcp_output.c | ||
---|---|---|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, avoid_pure_win_update, CTLFLAG_RW,
|
||
&avoid_pure_win_update, 1, "Avoid pure window updates when possible");
|
||
int tcp_do_autosndbuf = 1;
|
||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
|
||
&tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
|
||
int tcp_autosndbuf_inc = 8*1024;
|
||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
|
||
&tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
|
||
int tcp_autosndbuf_max = 16*1024*1024;
|
||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
|
||
&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
|
||
/*
|
||
* Tcp output routine: figure out what should be sent and send it.
|
||
*/
|
||
... | ... | |
}
|
||
}
|
||
KASSERT(len >= 0, ("%s: len < 0", __func__));
|
||
/*
|
||
* len will be >= 0 after this point. Truncate to the maximum
|
||
* segment length and ensure that FIN is removed if the length
|
||
* no longer contains the last data byte.
|
||
* Automatic sizing of send socket buffer. Often the send buffer
|
||
* size is not optimally adjusted to the actual network conditions
|
||
* at hand (delay bandwidth product). Setting the buffer size too
|
||
* small limits throughput on links with high bandwidth and high
|
||
* delay (eg. trans-continental/oceanic links). Setting the
|
||
* buffer size too big consumes too much real kernel memory,
|
||
* especially with many connections on busy servers.
|
||
*
|
||
* The criteria to step up the send buffer one notch are:
|
||
* 1. receive window of remote host is larger than send buffer
|
||
* (with a fudge factor of 5/4th);
|
||
* 2. send buffer is filled to 7/8th with data (so we actually
|
||
* have data to make use of it);
|
||
* 3. send buffer fill has not hit maximal automatic size;
|
||
* 4. our send window (slow start and congestion controlled) is
|
||
* larger than sent but unacknowledged data in send buffer.
|
||
*
|
||
* The remote host receive window scaling factor may limit the
|
||
* growing of the send buffer before it reaches its allowed
|
||
* maximum.
|
||
*
|
||
* It scales directly with slow start or congestion window
|
||
* and does at most one step per received ACK. This fast
|
||
* scaling has the drawback of growing the send buffer beyond
|
||
* what is strictly necessary to make full use of a given
|
||
* delay*bandwidth product. However testing has shown this not
|
||
* to be much of a problem. At worst we are trading wasting
|
||
* of available bandwidth (the non-use of it) for wasting some
|
||
* socket buffer memory.
|
||
*
|
||
* TODO: Shrink send buffer during idle periods together
|
||
* with congestion window. Requires another timer. Has to
|
||
* wait for upcoming tcp timer rewrite.
|
||
*/
|
||
if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
|
||
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
|
||
so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
|
||
so->so_snd.ssb_cc < tcp_autosndbuf_max &&
|
||
sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
|
||
if (!ssb_reserve(&so->so_snd,
|
||
min(so->so_snd.ssb_hiwat + tcp_autosndbuf_inc,
|
||
tcp_autosndbuf_max), so, NULL))
|
||
so->so_snd.ssb_flags &= ~SSB_AUTOSIZE;
|
||
}
|
||
}
|
||
/*
|
||
* Truncate to the maximum segment length and ensure that FIN is
|
||
* removed if the length no longer contains the last data byte.
|
||
*/
|
||
if (len > tp->t_maxseg) {
|
||
len = tp->t_maxseg;
|
||
... | ... | |
optlen += TCPOLEN_TSTAMP_APPA;
|
||
}
|
||
/* Set receive buffer autosizing timestamp. */
|
||
if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
|
||
tp->rfbuf_ts = ticks;
|
||
/*
|
||
* Send `CC-family' options if our side wants to use them (TF_REQ_CC),
|
||
* options are allowed (!TF_NOOPT) and it's not a RST.
|
sys/netinet/tcp_subr.c | ||
---|---|---|
&tcp_v6mssdflt, 0, "Default TCP Maximum Segment Size for IPv6");
|
||
#endif
|
||
/*
|
||
* Minimum MSS we accept and use. This prevents DoS attacks where
|
||
* we are forced to a ridiculously low MSS like 20 and send hundreds
|
||
* of packets instead of one. The effect scales with the available
|
||
* bandwidth and quickly saturates the CPU and network interface
|
||
* with packet generation and sending. Set to zero to disable MINMSS
|
||
* checking. This setting prevents us from sending too small packets.
|
||
*/
|
||
int tcp_minmss = TCP_MINMSS;
|
||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
|
||
&tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
|
||
/*
|
||
* Number of TCP segments per second we accept from remote host
|
||
* before we start to calculate average segment size. If average
|
||
* segment size drops below the minimum TCP MSS we assume a DoS
|
||
* attack and reset+drop the connection. Care has to be taken not to
|
||
* set this value too small to not kill interactive type connections
|
||
* (telnet, SSH) which send many small packets.
|
||
*/
|
||
int tcp_minmssoverload = TCP_MINMSSOVERLOAD;
|
||
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW,
|
||
&tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to"
|
||
"be under the MINMSS Size");
|
||
#if 0
|
||
static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
|
||
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
|
sys/netinet/tcp_usrreq.c | ||
---|---|---|
break;
|
||
case TCP_MAXSEG:
|
||
if (optval > 0 && optval <= tp->t_maxseg)
|
||
if (optval > 0 && optval <= tp->t_maxseg &&
|
||
optval + 40 >= tcp_minmss)
|
||
tp->t_maxseg = optval;
|
||
else
|
||
error = EINVAL;
|
||
... | ... | |
if (error)
|
||
return (error);
|
||
}
|
||
so->so_rcv.ssb_flags |= SSB_AUTOSIZE;
|
||
so->so_snd.ssb_flags |= SSB_AUTOSIZE;
|
||
cpu = mycpu->gd_cpuid;
|
||
error = in_pcballoc(so, &tcbinfo[cpu]);
|
||
if (error)
|
sys/netinet/tcp_var.h | ||
---|---|---|
u_long snd_bwnd; /* bandwidth-controlled window */
|
||
int t_bw_rtttime; /* used for bandwidth calculation */
|
||
tcp_seq t_bw_rtseq; /* used for bandwidth calculation */
|
||
/* anti DoS counters */
|
||
u_long rcv_second; /* start of interval second */
|
||
u_long rcv_pps; /* received packets per second */
|
||
u_long rcv_byps; /* received bytes per second */
|
||
u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
|
||
int rfbuf_cnt; /* recv buffer autoscaling byte count */
|
||
};
|
||
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
|
||
... | ... | |
u_long tcps_connects; /* connections established */
|
||
u_long tcps_drops; /* connections dropped */
|
||
u_long tcps_conndrops; /* embryonic connections dropped */
|
||
u_long tcps_minmssdrops; /* average minmss too low drops */
|
||
u_long tcps_closed; /* conn. closed (includes drops) */
|
||
u_long tcps_segstimed; /* segs where we tried to get rtt */
|
||
u_long tcps_rttupdated; /* times we succeeded */
|
||
... | ... | |
extern struct tcpcbackqhead tcpcbackq[];
|
||
extern int tcp_mssdflt; /* XXX */
|
||
extern int tcp_minmss;
|
||
extern int tcp_minmssoverload;
|
||
extern int tcp_delack_enabled;
|
||
extern int path_mtu_discovery;
|
||
sys/sys/socketvar.h | ||
---|---|---|
#define SSB_KNOTE 0x100 /* kernel note attached */
|
||
#define SSB_MEVENT 0x200 /* need message event notification */
|
||
#define SSB_STOP 0x400 /* backpressure indicator */
|
||
#define SSB_AUTOSIZE 0x800 /* automatically size socket buffer */
|
||
/*
|
||
* Per-socket kernel structure. Contains universal send and receive queues,
|