Bug #1437 » 0001-Implement-autosizing-TCP-socket-buffers.patch
| sys/kern/uipc_socket.c | ||
|---|---|---|
|
ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
|
||
|
ai.p_ucred = p->p_ucred;
|
||
|
ai.fd_rdir = p->p_fd->fd_rdir;
|
||
|
/*
|
||
|
* Auto-sizing of socket buffers is managed by the protocols and
|
||
|
* the appropriate flags must be set in the pru_attach function.
|
||
|
*/
|
||
|
error = so_pru_attach(so, proto, &ai);
|
||
|
if (error) {
|
||
|
so->so_state |= SS_NOFDREF;
|
||
| ... | ... | |
|
error = ENOBUFS;
|
||
|
goto bad;
|
||
|
}
|
||
|
(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
|
||
|
&so->so_rcv)->ssb_flags &= ~SSB_AUTOSIZE;
|
||
|
break;
|
||
|
/*
|
||
| sys/kern/uipc_socket2.c | ||
|---|---|---|
|
so->so_linger = head->so_linger;
|
||
|
so->so_state = head->so_state | SS_NOFDREF;
|
||
|
so->so_proto = head->so_proto;
|
||
|
so->so_timeo = head->so_timeo;
|
||
|
so->so_cred = crhold(head->so_cred);
|
||
|
ai.sb_rlimit = NULL;
|
||
|
ai.p_ucred = NULL;
|
||
| ... | ... | |
|
sodealloc(so);
|
||
|
return (NULL);
|
||
|
}
|
||
|
so->so_rcv.ssb_lowat = head->so_rcv.ssb_lowat;
|
||
|
so->so_snd.ssb_lowat = head->so_snd.ssb_lowat;
|
||
|
so->so_rcv.ssb_timeo = head->so_rcv.ssb_timeo;
|
||
|
so->so_snd.ssb_timeo = head->so_snd.ssb_timeo;
|
||
|
so->so_rcv.ssb_flags |= head->so_rcv.ssb_flags & SSB_AUTOSIZE;
|
||
|
so->so_snd.ssb_flags |= head->so_snd.ssb_flags & SSB_AUTOSIZE;
|
||
|
if (connstatus) {
|
||
|
TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
|
||
|
so->so_state |= SS_COMP;
|
||
| sys/kern/uipc_usrreq.c | ||
|---|---|---|
|
!(so->so_proto->pr_flags & PR_RIGHTS))
|
||
|
return(0);
|
||
|
#ifdef notdef
|
||
|
if (so->so_rcv.sb_flags & SB_LOCK) {
|
||
|
if (so->so_rcv.ssb_flags & SSB_LOCK) {
|
||
|
/*
|
||
|
* This is problematical; it's not clear
|
||
|
* we need to wait for the sockbuf to be
|
||
| sys/netinet/tcp.h | ||
|---|---|---|
|
#define TCP_MSS 1460
|
||
|
/*
|
||
|
* TCP_MINMSS is defined to be 256 which is fine for the smallest
|
||
|
* link MTU (296 bytes, SLIP interface) in the Internet.
|
||
|
* However it is very unlikely to come across such low MTU interfaces
|
||
|
* these days (anno dato 2003).
|
||
|
* Probably it can be set to 512 without ill effects. But we play safe.
|
||
|
* See tcp_subr.c tcp_minmss SYSCTL declaration for more comments.
|
||
|
* Setting this to "0" disables the minmss check.
|
||
|
*/
|
||
|
#define TCP_MINMSS 256
|
||
|
/*
|
||
|
* TCP_MINMSSOVERLOAD is defined to be 1000 which should cover any type
|
||
|
* of interactive TCP session.
|
||
|
* See tcp_subr.c tcp_minmssoverload SYSCTL declaration and tcp_input.c
|
||
|
* for more comments.
|
||
|
* Setting this to "0" disables the minmssoverload check.
|
||
|
*/
|
||
|
#define TCP_MINMSSOVERLOAD 1000
|
||
|
/*
|
||
|
* Default maximum segment size for TCP6.
|
||
|
* With an IP6 MSS of 1280, this is 1220,
|
||
|
* but 1024 is probably more convenient. (xxx kazu in doubt)
|
||
| sys/netinet/tcp_input.c | ||
|---|---|---|
|
&tcp_reass_overflows, 0,
|
||
|
"Global number of TCP Segment Reassembly Queue Overflows");
|
||
|
int tcp_do_autorcvbuf = 1;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
|
||
|
&tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
|
||
|
int tcp_autorcvbuf_inc = 16*1024;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
|
||
|
&tcp_autorcvbuf_inc, 0,
|
||
|
"Incrementor step size of automatic receive buffer");
|
||
|
int tcp_autorcvbuf_max = 16*1024*1024;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
|
||
|
&tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
|
||
|
static void tcp_dooptions(struct tcpopt *, u_char *, int, boolean_t);
|
||
|
static void tcp_pulloutofband(struct socket *,
|
||
|
struct tcphdr *, struct mbuf *, int);
|
||
| ... | ... | |
|
KASSERT(tp->t_state != TCPS_LISTEN, ("tcp_input: TCPS_LISTEN state"));
|
||
|
/*
|
||
|
* This is the second part of the MSS DoS prevention code (after
|
||
|
* minmss on the sending side) and it deals with too many too small
|
||
|
* tcp packets in a too short timeframe (1 second).
|
||
|
*
|
||
|
* For every full second we count the number of received packets
|
||
|
* and bytes. If we get a lot of packets per second for this connection
|
||
|
* (tcp_minmssoverload) we take a closer look at it and compute the
|
||
|
* average packet size for the past second. If that is less than
|
||
|
* tcp_minmss we get too many packets with very small payload which
|
||
|
* is not good and burdens our system (and every packet generates
|
||
|
* a wakeup to the process connected to our socket). We can reasonably
|
||
|
* expect this to be small packet DoS attack to exhaust our CPU
|
||
|
* cycles.
|
||
|
*
|
||
|
* Care has to be taken for the minimum packet overload value. This
|
||
|
* value defines the minimum number of packets per second before we
|
||
|
* start to worry. This must not be too low to avoid killing for
|
||
|
* example interactive connections with many small packets like
|
||
|
* telnet or SSH.
|
||
|
*
|
||
|
* Setting either tcp_minmssoverload or tcp_minmss to "0" disables
|
||
|
* this check.
|
||
|
*
|
||
|
* Account for packet if payload packet, skip over ACK, etc.
|
||
|
*/
|
||
|
if (tcp_minmss && tcp_minmssoverload &&
|
||
|
tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
|
||
|
if (tp->rcv_second > ticks) {
|
||
|
tp->rcv_pps++;
|
||
|
tp->rcv_byps += tlen + off;
|
||
|
if (tp->rcv_pps > tcp_minmssoverload) {
|
||
|
if ((tp->rcv_byps / tp->rcv_pps) < tcp_minmss) {
|
||
|
kprintf("too many small tcp packets from "
|
||
|
"%s:%u, av. %lubyte/packet, "
|
||
|
"dropping connection\n",
|
||
|
#ifdef INET6
|
||
|
isipv6 ?
|
||
|
ip6_sprintf(&inp->inp_inc.inc6_faddr) :
|
||
|
#endif
|
||
|
inet_ntoa(inp->inp_inc.inc_faddr),
|
||
|
inp->inp_inc.inc_fport,
|
||
|
tp->rcv_byps / tp->rcv_pps);
|
||
|
tp = tcp_drop(tp, ECONNRESET);
|
||
|
tcpstat.tcps_minmssdrops++;
|
||
|
goto drop;
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
tp->rcv_second = ticks + hz;
|
||
|
tp->rcv_pps = 1;
|
||
|
tp->rcv_byps = tlen + off;
|
||
|
}
|
||
|
}
|
||
|
/*
|
||
|
* Segment received on connection.
|
||
|
* Reset idle time and keep-alive timer.
|
||
|
*/
|
||
| ... | ... | |
|
th->th_ack == tp->snd_una &&
|
||
|
LIST_EMPTY(&tp->t_segq) &&
|
||
|
tlen <= ssb_space(&so->so_rcv)) {
|
||
|
int newsize = 0; /* automatic sockbuf scaling */
|
||
|
/*
|
||
|
* This is a pure, in-sequence data packet
|
||
|
* with nothing on the reassembly queue and
|
||
| ... | ... | |
|
tcpstat.tcps_rcvpack++;
|
||
|
tcpstat.tcps_rcvbyte += tlen;
|
||
|
ND6_HINT(tp); /* some progress has been done */
|
||
|
/*
|
||
|
* Automatic sizing of receive socket buffer. Often the send
|
||
|
* buffer size is not optimally adjusted to the actual network
|
||
|
* conditions at hand (delay bandwidth product). Setting the
|
||
|
* buffer size too small limits throughput on links with high
|
||
|
* bandwidth and high delay (eg. trans-continental/oceanic links).
|
||
|
*
|
||
|
* On the receive side the socket buffer memory is only rarely
|
||
|
* used to any significant extent. This allows us to be much
|
||
|
* more aggressive in scaling the receive socket buffer. For
|
||
|
* the case that the buffer space is actually used to a large
|
||
|
* extent and we run out of kernel memory we can simply drop
|
||
|
* the new segments; TCP on the sender will just retransmit it
|
||
|
* later. Setting the buffer size too big may only consume too
|
||
|
* much kernel memory if the application doesn't read() from
|
||
|
* the socket or packet loss or reordering makes use of the
|
||
|
* reassembly queue.
|
||
|
*
|
||
|
* The criteria to step up the receive buffer one notch are:
|
||
|
* 1. the number of bytes received during the time it takes
|
||
|
* one timestamp to be reflected back to us (the RTT);
|
||
|
* 2. received bytes per RTT is within seven eighth of the
|
||
|
* current socket buffer size;
|
||
|
* 3. receive buffer size has not hit maximal automatic size;
|
||
|
*
|
||
|
* This algorithm does one step per RTT at most and only if
|
||
|
* we receive a bulk stream w/o packet losses or reorderings.
|
||
|
* Shrinking the buffer during idle times is not necessary as
|
||
|
* it doesn't consume any memory when idle.
|
||
|
*
|
||
|
* TODO: Only step up if the application is actually serving
|
||
|
* the buffer to better manage the socket buffer resources.
|
||
|
*/
|
||
|
if (tcp_do_autorcvbuf &&
|
||
|
to.to_tsecr &&
|
||
|
(so->so_rcv.ssb_flags & SSB_AUTOSIZE)) {
|
||
|
if (to.to_tsecr > tp->rfbuf_ts &&
|
||
|
to.to_tsecr - tp->rfbuf_ts < hz) {
|
||
|
if (tp->rfbuf_cnt >
|
||
|
(so->so_rcv.ssb_hiwat / 8 * 7) &&
|
||
|
so->so_rcv.ssb_hiwat <
|
||
|
tcp_autorcvbuf_max) {
|
||
|
newsize =
|
||
|
min(so->so_rcv.ssb_hiwat +
|
||
|
tcp_autorcvbuf_inc,
|
||
|
tcp_autorcvbuf_max);
|
||
|
}
|
||
|
/* Start over with next RTT. */
|
||
|
tp->rfbuf_ts = 0;
|
||
|
tp->rfbuf_cnt = 0;
|
||
|
} else
|
||
|
tp->rfbuf_cnt += tlen; /* add up */
|
||
|
}
|
||
|
/*
|
||
|
* Add data to socket buffer.
|
||
|
*/
|
||
|
if (so->so_state & SS_CANTRCVMORE) {
|
||
|
m_freem(m);
|
||
|
} else {
|
||
|
/*
|
||
|
* Set new socket buffer size.
|
||
|
* Give up when limit is reached.
|
||
|
*/
|
||
|
if (newsize)
|
||
|
if (!ssb_reserve(&so->so_rcv, newsize,
|
||
|
so, NULL))
|
||
|
so->so_rcv.ssb_flags &= ~SSB_AUTOSIZE;
|
||
|
m_adj(m, drop_hdrlen); /* delayed header drop */
|
||
|
ssb_appendstream(&so->so_rcv, m);
|
||
|
}
|
||
| ... | ... | |
|
recvwin = 0;
|
||
|
tp->rcv_wnd = imax(recvwin, (int)(tp->rcv_adv - tp->rcv_nxt));
|
||
|
/* Reset receive buffer auto scaling when not in bulk receive mode. */
|
||
|
tp->rfbuf_ts = 0;
|
||
|
tp->rfbuf_cnt = 0;
|
||
|
switch (tp->t_state) {
|
||
|
/*
|
||
|
* If the state is SYN_RECEIVED:
|
||
| ... | ... | |
|
* Offer == 0 means that there was no MSS on the SYN segment,
|
||
|
* in this case we use tcp_mssdflt.
|
||
|
*/
|
||
|
if (offer == 0)
|
||
|
if (offer == 0) {
|
||
|
offer = (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
|
||
|
else
|
||
|
} else {
|
||
|
/*
|
||
|
* Prevent DoS attack with too small MSS. Round up
|
||
|
* to at least minmss.
|
||
|
*/
|
||
|
offer = max(offer, tcp_minmss);
|
||
|
/*
|
||
|
* Sanity check: make sure that maxopd will be large
|
||
|
* enough to allow some data on segments even if the
|
||
| ... | ... | |
|
* funny things may happen in tcp_output.
|
||
|
*/
|
||
|
offer = max(offer, 64);
|
||
|
}
|
||
|
taop->tao_mssopt = offer;
|
||
|
/*
|
||
| sys/netinet/tcp_output.c | ||
|---|---|---|
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, avoid_pure_win_update, CTLFLAG_RW,
|
||
|
&avoid_pure_win_update, 1, "Avoid pure window updates when possible");
|
||
|
int tcp_do_autosndbuf = 1;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
|
||
|
&tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
|
||
|
int tcp_autosndbuf_inc = 8*1024;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
|
||
|
&tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
|
||
|
int tcp_autosndbuf_max = 16*1024*1024;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
|
||
|
&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
|
||
|
/*
|
||
|
* Tcp output routine: figure out what should be sent and send it.
|
||
|
*/
|
||
| ... | ... | |
|
}
|
||
|
}
|
||
|
KASSERT(len >= 0, ("%s: len < 0", __func__));
|
||
|
/*
|
||
|
* len will be >= 0 after this point. Truncate to the maximum
|
||
|
* segment length and ensure that FIN is removed if the length
|
||
|
* no longer contains the last data byte.
|
||
|
* Automatic sizing of send socket buffer. Often the send buffer
|
||
|
* size is not optimally adjusted to the actual network conditions
|
||
|
* at hand (delay bandwidth product). Setting the buffer size too
|
||
|
* small limits throughput on links with high bandwidth and high
|
||
|
* delay (eg. trans-continental/oceanic links). Setting the
|
||
|
* buffer size too big consumes too much real kernel memory,
|
||
|
* especially with many connections on busy servers.
|
||
|
*
|
||
|
* The criteria to step up the send buffer one notch are:
|
||
|
* 1. receive window of remote host is larger than send buffer
|
||
|
* (with a fudge factor of 5/4th);
|
||
|
* 2. send buffer is filled to 7/8th with data (so we actually
|
||
|
* have data to make use of it);
|
||
|
* 3. send buffer fill has not hit maximal automatic size;
|
||
|
* 4. our send window (slow start and congestion controlled) is
|
||
|
* larger than sent but unacknowledged data in send buffer.
|
||
|
*
|
||
|
* The remote host receive window scaling factor may limit the
|
||
|
* growing of the send buffer before it reaches its allowed
|
||
|
* maximum.
|
||
|
*
|
||
|
* It scales directly with slow start or congestion window
|
||
|
* and does at most one step per received ACK. This fast
|
||
|
* scaling has the drawback of growing the send buffer beyond
|
||
|
* what is strictly necessary to make full use of a given
|
||
|
* delay*bandwidth product. However testing has shown this not
|
||
|
* to be much of a problem. At worst we are trading wasting
|
||
|
* of available bandwidth (the non-use of it) for wasting some
|
||
|
* socket buffer memory.
|
||
|
*
|
||
|
* TODO: Shrink send buffer during idle periods together
|
||
|
* with congestion window. Requires another timer. Has to
|
||
|
* wait for upcoming tcp timer rewrite.
|
||
|
*/
|
||
|
if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
|
||
|
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
|
||
|
so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
|
||
|
so->so_snd.ssb_cc < tcp_autosndbuf_max &&
|
||
|
sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
|
||
|
if (!ssb_reserve(&so->so_snd,
|
||
|
min(so->so_snd.ssb_hiwat + tcp_autosndbuf_inc,
|
||
|
tcp_autosndbuf_max), so, NULL))
|
||
|
so->so_snd.ssb_flags &= ~SSB_AUTOSIZE;
|
||
|
}
|
||
|
}
|
||
|
/*
|
||
|
* Truncate to the maximum segment length and ensure that FIN is
|
||
|
* removed if the length no longer contains the last data byte.
|
||
|
*/
|
||
|
if (len > tp->t_maxseg) {
|
||
|
len = tp->t_maxseg;
|
||
| ... | ... | |
|
optlen += TCPOLEN_TSTAMP_APPA;
|
||
|
}
|
||
|
/* Set receive buffer autosizing timestamp. */
|
||
|
if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
|
||
|
tp->rfbuf_ts = ticks;
|
||
|
/*
|
||
|
* Send `CC-family' options if our side wants to use them (TF_REQ_CC),
|
||
|
* options are allowed (!TF_NOOPT) and it's not a RST.
|
||
| sys/netinet/tcp_subr.c | ||
|---|---|---|
|
&tcp_v6mssdflt, 0, "Default TCP Maximum Segment Size for IPv6");
|
||
|
#endif
|
||
|
/*
|
||
|
* Minimum MSS we accept and use. This prevents DoS attacks where
|
||
|
* we are forced to a ridiculous low MSS like 20 and send hundreds
|
||
|
* of packets instead of one. The effect scales with the available
|
||
|
* bandwidth and quickly saturates the CPU and network interface
|
||
|
* with packet generation and sending. Set to zero to disable MINMSS
|
||
|
* checking. This setting prevents us from sending too small packets.
|
||
|
*/
|
||
|
int tcp_minmss = TCP_MINMSS;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
|
||
|
&tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
|
||
|
/*
|
||
|
* Number of TCP segments per second we accept from remote host
|
||
|
* before we start to calculate average segment size. If average
|
||
|
* segment size drops below the minimum TCP MSS we assume a DoS
|
||
|
* attack and reset+drop the connection. Care has to be taken not to
|
||
|
* set this value too small to not kill interactive type connections
|
||
|
* (telnet, SSH) which send many small packets.
|
||
|
*/
|
||
|
int tcp_minmssoverload = TCP_MINMSSOVERLOAD;
|
||
|
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW,
|
||
|
&tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to"
|
||
|
"be under the MINMSS Size");
|
||
|
#if 0
|
||
|
static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
|
||
|
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
|
||
| sys/netinet/tcp_usrreq.c | ||
|---|---|---|
|
break;
|
||
|
case TCP_MAXSEG:
|
||
|
if (optval > 0 && optval <= tp->t_maxseg)
|
||
|
if (optval > 0 && optval <= tp->t_maxseg &&
|
||
|
optval + 40 >= tcp_minmss)
|
||
|
tp->t_maxseg = optval;
|
||
|
else
|
||
|
error = EINVAL;
|
||
| ... | ... | |
|
if (error)
|
||
|
return (error);
|
||
|
}
|
||
|
so->so_rcv.ssb_flags |= SSB_AUTOSIZE;
|
||
|
so->so_snd.ssb_flags |= SSB_AUTOSIZE;
|
||
|
cpu = mycpu->gd_cpuid;
|
||
|
error = in_pcballoc(so, &tcbinfo[cpu]);
|
||
|
if (error)
|
||
| sys/netinet/tcp_var.h | ||
|---|---|---|
|
u_long snd_bwnd; /* bandwidth-controlled window */
|
||
|
int t_bw_rtttime; /* used for bandwidth calculation */
|
||
|
tcp_seq t_bw_rtseq; /* used for bandwidth calculation */
|
||
|
/* anti DoS counters */
|
||
|
u_long rcv_second; /* start of interval second */
|
||
|
u_long rcv_pps; /* received packets per second */
|
||
|
u_long rcv_byps; /* received bytes per second */
|
||
|
u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
|
||
|
int rfbuf_cnt; /* recv buffer autoscaling byte count */
|
||
|
};
|
||
|
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
|
||
| ... | ... | |
|
u_long tcps_connects; /* connections established */
|
||
|
u_long tcps_drops; /* connections dropped */
|
||
|
u_long tcps_conndrops; /* embryonic connections dropped */
|
||
|
u_long tcps_minmssdrops; /* average minmss too low drops */
|
||
|
u_long tcps_closed; /* conn. closed (includes drops) */
|
||
|
u_long tcps_segstimed; /* segs where we tried to get rtt */
|
||
|
u_long tcps_rttupdated; /* times we succeeded */
|
||
| ... | ... | |
|
extern struct tcpcbackqhead tcpcbackq[];
|
||
|
extern int tcp_mssdflt; /* XXX */
|
||
|
extern int tcp_minmss;
|
||
|
extern int tcp_minmssoverload;
|
||
|
extern int tcp_delack_enabled;
|
||
|
extern int path_mtu_discovery;
|
||
| sys/sys/socketvar.h | ||
|---|---|---|
|
#define SSB_KNOTE 0x100 /* kernel note attached */
|
||
|
#define SSB_MEVENT 0x200 /* need message event notification */
|
||
|
#define SSB_STOP 0x400 /* backpressure indicator */
|
||
|
#define SSB_AUTOSIZE 0x800 /* automatically size socket buffer */
|
||
|
/*
|
||
|
* Per-socket kernel structure. Contains universal send and receive queues,
|
||