Bug #1437 » 0001-Implement-autosizing-TCP-socket-buffers.patch
| sys/kern/uipc_socket.c | ||
|---|---|---|
|
ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
|
||
|
ai.p_ucred = p->p_ucred;
|
||
|
ai.fd_rdir = p->p_fd->fd_rdir;
|
||
|
/*
|
||
|
* Auto-sizing of socket buffers is managed by the protocols and
|
||
|
* the appropriate flags must be set in the pru_attach function.
|
||
|
*/
|
||
|
error = so_pru_attach(so, proto, &ai);
|
||
|
if (error) {
|
||
|
so->so_state |= SS_NOFDREF;
|
||
| ... | ... | |
|
error = ENOBUFS;
|
||
|
goto bad;
|
||
|
}
|
||
|
(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
|
||
|
&so->so_rcv)->ssb_flags &= ~SSB_AUTOSIZE;
|
||
|
break;
|
||
|
/*
|
||
| sys/kern/uipc_socket2.c | ||
|---|---|---|
|
so->so_linger = head->so_linger;
|
||
|
so->so_state = head->so_state | SS_NOFDREF;
|
||
|
so->so_proto = head->so_proto;
|
||
|
so->so_timeo = head->so_timeo;
|
||
|
so->so_cred = crhold(head->so_cred);
|
||
|
ai.sb_rlimit = NULL;
|
||
|
ai.p_ucred = NULL;
|
||
| ... | ... | |
|
sodealloc(so);
|
||
|
return (NULL);
|
||
|
}
|
||
|
so->so_rcv.ssb_lowat = head->so_rcv.ssb_lowat;
|
||
|
so->so_snd.ssb_lowat = head->so_snd.ssb_lowat;
|
||
|
so->so_rcv.ssb_timeo = head->so_rcv.ssb_timeo;
|
||
|
so->so_snd.ssb_timeo = head->so_snd.ssb_timeo;
|
||
|
so->so_rcv.ssb_flags |= head->so_rcv.ssb_flags & SSB_AUTOSIZE;
|
||
|
so->so_snd.ssb_flags |= head->so_snd.ssb_flags & SSB_AUTOSIZE;
|
||
|
if (connstatus) {
|
||
|
TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
|
||
|
so->so_state |= SS_COMP;
|
||
| sys/kern/uipc_usrreq.c | ||
|---|---|---|
|
!(so->so_proto->pr_flags & PR_RIGHTS))
|
||
|
return(0);
|
||
|
#ifdef notdef
|
||
|
if (so->so_rcv.sb_flags & SB_LOCK) {
|
||
|
if (so->so_rcv.ssb_flags & SSB_LOCK) {
|
||
|
/*
|
||
|
* This is problematical; it's not clear
|
||
|
* we need to wait for the sockbuf to be
|
||
| sys/netinet/tcp.h | ||
|---|---|---|
|
#define TCP_MSS 1460
|
||
|
/*
|
||
|
* TCP_MINMSS is defined to be 256 which is fine for the smallest
|
||
|
* link MTU (296 bytes, SLIP interface) in the Internet.
|
||
|
* However it is very unlikely to come across such low MTU interfaces
|
||
|
* these days (anno dato 2003).
|
||
|
* Probably it can be set to 512 without ill effects. But we play safe.
|
||
|
* See tcp_subr.c tcp_minmss SYSCTL declaration for more comments.
|
||
|
* Setting this to "0" disables the minmss check.
|
||
|
*/
|
||
|
#define TCP_MINMSS 256
|
||
|
/*
|
||
|
* TCP_MINMSSOVERLOAD is defined to be 1000 which should cover any type
|
||
|
* of interactive TCP session.
|
||
|
* See tcp_subr.c tcp_minmssoverload SYSCTL declaration and tcp_input.c
|
||
|
* for more comments.
|
||
|
* Setting this to "0" disables the minmssoverload check.
|
||
|
*/
|
||
|
#define TCP_MINMSSOVERLOAD 1000
|
||
|
/*
|
||
|
* Default maximum segment size for TCP6.
|
||
|
* With an IP6 MSS of 1280, this is 1220,
|
||
|
* but 1024 is probably more convenient. (xxx kazu in doubt)
|
||
| sys/netinet/tcp_input.c | ||
|---|---|---|
|
&tcp_reass_overflows, 0,
|
||
|
"Global number of TCP Segment Reassembly Queue Overflows");
|
||
|
int tcp_do_autorcvbuf = 1;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
|
||
|
&tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
|
||
|
int tcp_autorcvbuf_inc = 16*1024;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
|
||
|
&tcp_autorcvbuf_inc, 0,
|
||
|
"Incrementor step size of automatic receive buffer");
|
||
|
int tcp_autorcvbuf_max = 16*1024*1024;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
|
||
|
&tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
|
||
|
static void tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
|
||
|
static void tcp_pulloutofband(struct socket *,
|
||
|
struct tcphdr *, struct mbuf *, int);
|
||
| ... | ... | |
|
KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
|
||
|
/*
|
||
|
* This is the second part of the MSS DoS prevention code (after
|
||
|
* minmss on the sending side) and it deals with too many too small
|
||
|
* tcp packets in a too short timeframe (1 second).
|
||
|
*
|
||
|
* For every full second we count the number of received packets
|
||
|
* and bytes. If we get a lot of packets per second for this connection
|
||
|
* (tcp_minmssoverload) we take a closer look at it and compute the
|
||
|
* average packet size for the past second. If that is less than
|
||
|
* tcp_minmss we get too many packets with very small payload which
|
||
|
* is not good and burdens our system (and every packet generates
|
||
|
* a wakeup to the process connected to our socket). We can reasonably
|
||
|
* expect this to be small packet DoS attack to exhaust our CPU
|
||
|
* cycles.
|
||
|
*
|
||
|
* Care has to be taken for the minimum packet overload value. This
|
||
|
* value defines the minimum number of packets per second before we
|
||
|
* start to worry. This must not be too low to avoid killing for
|
||
|
* example interactive connections with many small packets like
|
||
|
* telnet or SSH.
|
||
|
*
|
||
|
* Setting either tcp_minmssoverload or tcp_minmss to "0" disables
|
||
|
* this check.
|
||
|
*
|
||
|
* Account for packet if payload packet, skip over ACK, etc.
|
||
|
*/
|
||
|
if (tcp_minmss && tcp_minmssoverload &&
|
||
|
tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
|
||
|
if (tp->rcv_second > ticks) {
|
||
|
tp->rcv_pps++;
|
||
|
tp->rcv_byps += tlen + off;
|
||
|
if (tp->rcv_pps > tcp_minmssoverload) {
|
||
|
if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
|
||
|
kprintf("too many small tcp packets from "
|
||
|
"%s:%u, av. %lubyte/packet, "
|
||
|
"dropping connection\n",
|
||
|
#ifdef INET6
|
||
|
isipv6 ?
|
||
|
ip6_sprintf(&inp->inp_inc.inc6_faddr) :
|
||
|
#endif
|
||
|
inet_ntoa(inp->inp_inc.inc_faddr),
|
||
|
inp->inp_inc.inc_fport,
|
||
|
tp->rcv_byps / tp->rcv_pps);
|
||
|
tp = tcp_drop(tp, ECONNRESET);
|
||
|
tcpstat.tcps_minmssdrops++;
|
||
|
goto drop;
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
tp->rcv_second = ticks + hz;
|
||
|
tp->rcv_pps = 1;
|
||
|
tp->rcv_byps = tlen + off;
|
||
|
}
|
||
|
}
|
||
|
/*
|
||
|
* Segment received on connection.
|
||
|
* Reset idle time and keep-alive timer.
|
||
|
*/
|
||
| ... | ... | |
|
th->th_ack == tp->snd_una &&
|
||
|
LIST_EMPTY(&tp->t_segq) &&
|
||
|
tlen <= ssb_space(&so->so_rcv)) {
|
||
|
int newsize = 0; /* automatic sockbuf scaling */
|
||
|
/*
|
||
|
* This is a pure, in-sequence data packet
|
||
|
* with nothing on the reassembly queue and
|
||
| ... | ... | |
|
tcpstat.tcps_rcvpack++;
|
||
|
tcpstat.tcps_rcvbyte += tlen;
|
||
|
ND6_HINT(tp); /* some progress has been done */
|
||
|
/*
|
||
|
* Automatic sizing of receive socket buffer. Often the send
|
||
|
* buffer size is not optimally adjusted to the actual network
|
||
|
* conditions at hand (delay bandwidth product). Setting the
|
||
|
* buffer size too small limits throughput on links with high
|
||
|
* bandwidth and high delay (eg. trans-continental/oceanic links).
|
||
|
*
|
||
|
* On the receive side the socket buffer memory is only rarely
|
||
|
* used to any significant extent. This allows us to be much
|
||
|
* more aggressive in scaling the receive socket buffer. For
|
||
|
* the case that the buffer space is actually used to a large
|
||
|
* extent and we run out of kernel memory we can simply drop
|
||
|
* the new segments; TCP on the sender will just retransmit it
|
||
|
* later. Setting the buffer size too big may only consume too
|
||
|
* much kernel memory if the application doesn't read() from
|
||
|
* the socket or packet loss or reordering makes use of the
|
||
|
* reassembly queue.
|
||
|
*
|
||
|
* The criteria to step up the receive buffer one notch are:
|
||
|
* 1. the number of bytes received during the time it takes
|
||
|
* one timestamp to be reflected back to us (the RTT);
|
||
|
* 2. received bytes per RTT is within seven eighth of the
|
||
|
* current socket buffer size;
|
||
|
* 3. receive buffer size has not hit maximal automatic size;
|
||
|
*
|
||
|
* This algorithm does one step per RTT at most and only if
|
||
|
* we receive a bulk stream w/o packet losses or reorderings.
|
||
|
* Shrinking the buffer during idle times is not necessary as
|
||
|
* it doesn't consume any memory when idle.
|
||
|
*
|
||
|
* TODO: Only step up if the application is actually serving
|
||
|
* the buffer to better manage the socket buffer resources.
|
||
|
*/
|
||
|
if (tcp_do_autorcvbuf &&
|
||
|
to.to_tsecr &&
|
||
|
(so->so_rcv.ssb_flags & SSB_AUTOSIZE)) {
|
||
|
if (to.to_tsecr > tp->rfbuf_ts &&
|
||
|
to.to_tsecr - tp->rfbuf_ts < hz) {
|
||
|
if (tp->rfbuf_cnt >
|
||
|
(so->so_rcv.ssb_hiwat / 8 * 7) &&
|
||
|
so->so_rcv.ssb_hiwat <
|
||
|
tcp_autorcvbuf_max) {
|
||
|
newsize =
|
||
|
min(so->so_rcv.ssb_hiwat +
|
||
|
tcp_autorcvbuf_inc,
|
||
|
tcp_autorcvbuf_max);
|
||
|
}
|
||
|
/* Start over with next RTT. */
|
||
|
tp->rfbuf_ts = 0;
|
||
|
tp->rfbuf_cnt = 0;
|
||
|
} else
|
||
|
tp->rfbuf_cnt += tlen; /* add up */
|
||
|
}
|
||
|
/*
|
||
|
* Add data to socket buffer.
|
||
|
*/
|
||
|
if (so->so_state & SS_CANTRCVMORE) {
|
||
|
m_freem(m);
|
||
|
} else {
|
||
|
/*
|
||
|
* Set new socket buffer size.
|
||
|
* Give up when limit is reached.
|
||
|
*/
|
||
|
if (newsize)
|
||
|
if (!ssb_reserve(&so->so_rcv, newsize,
|
||
|
so, NULL))
|
||
|
so->so_rcv.ssb_flags &= ~SSB_AUTOSIZE;
|
||
|
m_adj(m, drop_hdrlen); /* delayed header drop */
|
||
|
ssb_appendstream(&so->so_rcv, m);
|
||
|
}
|
||
| ... | ... | |
|
recvwin = 0;
|
||
|
tp->rcv_wnd = imax(recvwin, (int)(tp->rcv_adv - tp->rcv_nxt));
|
||
|
/* Reset receive buffer auto scaling when not in bulk receive mode. */
|
||
|
tp->rfbuf_ts = 0;
|
||
|
tp->rfbuf_cnt = 0;
|
||
|
switch (tp->t_state) {
|
||
|
/*
|
||
|
* If the state is SYN_RECEIVED:
|
||
| ... | ... | |
|
* Offer == 0 means that there was no MSS on the SYN segment,
|
||
|
* in this case we use tcp_mssdflt.
|
||
|
*/
|
||
|
if (offer == 0)
|
||
|
if (offer == 0) {
|
||
|
offer = (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
|
||
|
else
|
||
|
} else {
|
||
|
/*
|
||
|
* Prevent DoS attack with too small MSS. Round up
|
||
|
* to at least minmss.
|
||
|
*/
|
||
|
offer = max(offer, tcp_minmss);
|
||
|
/*
|
||
|
* Sanity check: make sure that maxopd will be large
|
||
|
* enough to allow some data on segments even if the
|
||
| ... | ... | |
|
* funny things may happen in tcp_output.
|
||
|
*/
|
||
|
offer = max(offer, 64);
|
||
|
}
|
||
|
taop->tao_mssopt = offer;
|
||
|
/*
|
||
| sys/netinet/tcp_output.c | ||
|---|---|---|
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, avoid_pure_win_update, CTLFLAG_RW,
|
||
|
&avoid_pure_win_update, 1, "Avoid pure window updates when possible");
|
||
|
int tcp_do_autosndbuf = 1;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
|
||
|
&tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
|
||
|
int tcp_autosndbuf_inc = 8*1024;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
|
||
|
&tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
|
||
|
int tcp_autosndbuf_max = 16*1024*1024;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
|
||
|
&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
|
||
|
/*
|
||
|
* Tcp output routine: figure out what should be sent and send it.
|
||
|
*/
|
||
| ... | ... | |
|
}
|
||
|
}
|
||
|
KASSERT(len >= 0, ("%s: len < 0", __func__));
|
||
|
/*
|
||
|
* len will be >= 0 after this point. Truncate to the maximum
|
||
|
* segment length and ensure that FIN is removed if the length
|
||
|
* no longer contains the last data byte.
|
||
|
* Automatic sizing of send socket buffer. Often the send buffer
|
||
|
* size is not optimally adjusted to the actual network conditions
|
||
|
* at hand (delay bandwidth product). Setting the buffer size too
|
||
|
* small limits throughput on links with high bandwidth and high
|
||
|
* delay (eg. trans-continental/oceanic links). Setting the
|
||
|
* buffer size too big consumes too much real kernel memory,
|
||
|
* especially with many connections on busy servers.
|
||
|
*
|
||
|
* The criteria to step up the send buffer one notch are:
|
||
|
* 1. receive window of remote host is larger than send buffer
|
||
|
* (with a fudge factor of 5/4th);
|
||
|
* 2. send buffer is filled to 7/8th with data (so we actually
|
||
|
* have data to make use of it);
|
||
|
* 3. send buffer fill has not hit maximal automatic size;
|
||
|
* 4. our send window (slow start and congestion controlled) is
|
||
|
* larger than sent but unacknowledged data in send buffer.
|
||
|
*
|
||
|
* The remote host receive window scaling factor may limit the
|
||
|
* growing of the send buffer before it reaches its allowed
|
||
|
* maximum.
|
||
|
*
|
||
|
* It scales directly with slow start or congestion window
|
||
|
* and does at most one step per received ACK. This fast
|
||
|
* scaling has the drawback of growing the send buffer beyond
|
||
|
* what is strictly necessary to make full use of a given
|
||
|
* delay*bandwidth product. However testing has shown this not
|
||
|
* to be much of a problem. At worst we are trading wasting
|
||
|
* of available bandwidth (the non-use of it) for wasting some
|
||
|
* socket buffer memory.
|
||
|
*
|
||
|
* TODO: Shrink send buffer during idle periods together
|
||
|
* with congestion window. Requires another timer. Has to
|
||
|
* wait for upcoming tcp timer rewrite.
|
||
|
*/
|
||
|
if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
|
||
|
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
|
||
|
so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
|
||
|
so->so_snd.ssb_cc < tcp_autosndbuf_max &&
|
||
|
sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
|
||
|
if (!ssb_reserve(&so->so_snd,
|
||
|
min(so->so_snd.ssb_hiwat + tcp_autosndbuf_inc,
|
||
|
tcp_autosndbuf_max), so, NULL))
|
||
|
so->so_snd.ssb_flags &= ~SSB_AUTOSIZE;
|
||
|
}
|
||
|
}
|
||
|
/*
|
||
|
* Truncate to the maximum segment length and ensure that FIN is
|
||
|
* removed if the length no longer contains the last data byte.
|
||
|
*/
|
||
|
if (len > tp->t_maxseg) {
|
||
|
len = tp->t_maxseg;
|
||
| ... | ... | |
|
optlen += TCPOLEN_TSTAMP_APPA;
|
||
|
}
|
||
|
/* Set receive buffer autosizing timestamp. */
|
||
|
if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
|
||
|
tp->rfbuf_ts = ticks;
|
||
|
/*
|
||
|
* Send `CC-family' options if our side wants to use them (TF_REQ_CC),
|
||
|
* options are allowed (!TF_NOOPT) and it's not a RST.
|
||
| sys/netinet/tcp_subr.c | ||
|---|---|---|
|
&tcp_v6mssdflt, 0, "Default TCP Maximum Segment Size for IPv6");
|
||
|
#endif
|
||
|
/*
|
||
|
* Minimum MSS we accept and use. This prevents DoS attacks where
|
||
|
* we are forced to a ridiculous low MSS like 20 and send hundreds
|
||
|
* of packets instead of one. The effect scales with the available
|
||
|
* bandwidth and quickly saturates the CPU and network interface
|
||
|
* with packet generation and sending. Set to zero to disable MINMSS
|
||
|
* checking. This setting prevents us from sending too small packets.
|
||
|
*/
|
||
|
int tcp_minmss = TCP_MINMSS;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
|
||
|
&tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
|
||
|
/*
|
||
|
* Number of TCP segments per second we accept from remote host
|
||
|
* before we start to calculate average segment size. If average
|
||
|
* segment size drops below the minimum TCP MSS we assume a DoS
|
||
|
* attack and reset+drop the connection. Care has to be taken not to
|
||
|
* set this value too small to not kill interactive type connections
|
||
|
* (telnet, SSH) which send many small packets.
|
||
|
*/
|
||
|
int tcp_minmssoverload = TCP_MINMSSOVERLOAD;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW,
|
||
|
&tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to"
|
||
|
"be under the MINMSS Size");
|
||
|
#if 0
|
||
|
static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
|
||
|
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
|
||
| sys/netinet/tcp_usrreq.c | ||
|---|---|---|
|
break;
|
||
|
case TCP_MAXSEG:
|
||
|
if (optval > 0 && optval <= tp->t_maxseg)
|
||
|
if (optval > 0 && optval <= tp->t_maxseg &&
|
||
|
optval + 40 >= tcp_minmss)
|
||
|
tp->t_maxseg = optval;
|
||
|
else
|
||
|
error = EINVAL;
|
||
| ... | ... | |
|
if (error)
|
||
|
return (error);
|
||
|
}
|
||
|
so->so_rcv.ssb_flags |= SSB_AUTOSIZE;
|
||
|
so->so_snd.ssb_flags |= SSB_AUTOSIZE;
|
||
|
cpu = mycpu->gd_cpuid;
|
||
|
error = in_pcballoc(so, &tcbinfo[cpu]);
|
||
|
if (error)
|
||
| sys/netinet/tcp_var.h | ||
|---|---|---|
|
u_long snd_bwnd; /* bandwidth-controlled window */
|
||
|
int t_bw_rtttime; /* used for bandwidth calculation */
|
||
|
tcp_seq t_bw_rtseq; /* used for bandwidth calculation */
|
||
|
/* anti DoS counters */
|
||
|
u_long rcv_second; /* start of interval second */
|
||
|
u_long rcv_pps; /* received packets per second */
|
||
|
u_long rcv_byps; /* received bytes per second */
|
||
|
u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
|
||
|
int rfbuf_cnt; /* recv buffer autoscaling byte count */
|
||
|
};
|
||
|
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
|
||
| ... | ... | |
|
u_long tcps_connects; /* connections established */
|
||
|
u_long tcps_drops; /* connections dropped */
|
||
|
u_long tcps_conndrops; /* embryonic connections dropped */
|
||
|
u_long tcps_minmssdrops; /* average minmss too low drops */
|
||
|
u_long tcps_closed; /* conn. closed (includes drops) */
|
||
|
u_long tcps_segstimed; /* segs where we tried to get rtt */
|
||
|
u_long tcps_rttupdated; /* times we succeeded */
|
||
| ... | ... | |
|
extern struct tcpcbackqhead tcpcbackq[];
|
||
|
extern int tcp_mssdflt; /* XXX */
|
||
|
extern int tcp_minmss;
|
||
|
extern int tcp_minmssoverload;
|
||
|
extern int tcp_delack_enabled;
|
||
|
extern int path_mtu_discovery;
|
||
| sys/sys/socketvar.h | ||
|---|---|---|
|
#define SSB_KNOTE 0x100 /* kernel note attached */
|
||
|
#define SSB_MEVENT 0x200 /* need message event notification */
|
||
|
#define SSB_STOP 0x400 /* backpressure indicator */
|
||
|
#define SSB_AUTOSIZE 0x800 /* automatically size socket buffer */
|
||
|
/*
|
||
|
* Per-socket kernel structure. Contains universal send and receive queues,
|
||